1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 #include <aconf.h>
10 
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14 
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <limits.h>
#include <math.h>
#include <string.h>
#include <ctype.h>
20 #ifdef WIN32
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h>    // for setmode
23 #endif
24 #include "gmem.h"
25 #include "GString.h"
26 #include "GList.h"
27 #include "config.h"
28 #include "Error.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
32 #include "GfxState.h"
33 #include "TextOutputDev.h"
34 
35 #ifdef MACOS
36 // needed for setting type/creator of MacOS files
37 #include "ICSupport.h"
38 #endif
39 
40 //------------------------------------------------------------------------
41 // parameters
42 //------------------------------------------------------------------------
43 
// Each bucket in a text pool includes baselines within a range of
// this many points.
#define textPoolStep 4

// Inter-character space width which will cause addChar to start a new
// word.
#define minWordBreakSpace 0.1

// Negative inter-character space width, i.e., overlap, which will
// cause addChar to start a new word.
#define minDupBreakOverlap 0.2

// Max distance between baselines of two lines within a block, as a
// fraction of the font size.
#define maxLineSpacingDelta 1.5

// Max difference in primary font sizes on two lines in the same
// block.  Delta1 is used when examining new lines above and below the
// current block; delta2 is used when examining text that overlaps the
// current block; delta3 is used when examining text to the left and
// right of the current block.
#define maxBlockFontSizeDelta1 0.05
#define maxBlockFontSizeDelta2 0.6
#define maxBlockFontSizeDelta3 0.2

// Max difference in font sizes inside a word.
#define maxWordFontSizeDelta 0.05

// Maximum distance between baselines of two words on the same line,
// e.g., distance between subscript or superscript and the primary
// baseline, as a fraction of the font size.
#define maxIntraLineDelta 0.5

// Minimum inter-word spacing, as a fraction of the font size.  (Only
// used for raw ordering.)
#define minWordSpacing 0.15

// Maximum inter-word spacing, as a fraction of the font size.
#define maxWordSpacing 1.5

// Maximum horizontal spacing which will allow a word to be pulled
// into a block.
// NOTE(review): the name says "min" while the description says
// "maximum" -- confirm against the use site before renaming either.
#define minColSpacing1 0.3

// Minimum spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0

// Maximum vertical spacing between blocks within a flow, as a
// multiple of the font size.
#define maxBlockSpacing 2.5

// Minimum spacing between characters within a word, as a fraction of
// the font size.  The value is negative (characters may overlap), so
// it is parenthesized to keep the sign attached to the literal
// wherever the macro is expanded.
#define minCharSpacing (-0.2)

// Maximum spacing between characters within a word, as a fraction of
// the font size, when there is no obvious extra-wide character
// spacing.
#define maxCharSpacing 0.03

// When extra-wide character spacing is detected, the inter-character
// space threshold is set to the minimum inter-character space
// multiplied by this constant.
#define maxWideCharSpacingMul 1.3

// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2
114 
115 //------------------------------------------------------------------------
116 // TextFontInfo
117 //------------------------------------------------------------------------
118 
TextFontInfo(GfxState * state)119 TextFontInfo::TextFontInfo(GfxState *state) {
120   gfxFont = state->getFont();
121 #if TEXTOUT_WORD_LIST
122   fontName = (gfxFont && gfxFont->getOrigName())
123                  ? gfxFont->getOrigName()->copy()
124                  : (GString *)NULL;
125 #endif
126 }
127 
~TextFontInfo()128 TextFontInfo::~TextFontInfo() {
129 #if TEXTOUT_WORD_LIST
130   if (fontName) {
131     delete fontName;
132   }
133 #endif
134 }
135 
matches(GfxState * state)136 GBool TextFontInfo::matches(GfxState *state) {
137   return state->getFont() == gfxFont;
138 }
139 
140 //------------------------------------------------------------------------
141 // TextWord
142 //------------------------------------------------------------------------
143 
TextWord(GfxState * state,int rotA,double x0,double y0,int charPosA,TextFontInfo * fontA,double fontSizeA)144 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
145 		   int charPosA, TextFontInfo *fontA, double fontSizeA) {
146   GfxFont *gfxFont;
147   double x, y, ascent, descent;
148 
149   rot = rotA;
150   charPos = charPosA;
151   charLen = 0;
152   font = fontA;
153   fontSize = fontSizeA;
154   state->transform(x0, y0, &x, &y);
155   if ((gfxFont = font->gfxFont)) {
156     ascent = gfxFont->getAscent() * fontSize;
157     descent = gfxFont->getDescent() * fontSize;
158   } else {
159     // this means that the PDF file draws text without a current font,
160     // which should never happen
161     ascent = 0.95 * fontSize;
162     descent = -0.35 * fontSize;
163   }
164   switch (rot) {
165   case 0:
166     yMin = y - ascent;
167     yMax = y - descent;
168     if (yMin == yMax) {
169       // this is a sanity check for a case that shouldn't happen -- but
170       // if it does happen, we want to avoid dividing by zero later
171       yMin = y;
172       yMax = y + 1;
173     }
174     base = y;
175     break;
176   case 1:
177     xMin = x + descent;
178     xMax = x + ascent;
179     if (xMin == xMax) {
180       // this is a sanity check for a case that shouldn't happen -- but
181       // if it does happen, we want to avoid dividing by zero later
182       xMin = x;
183       xMax = x + 1;
184     }
185     base = x;
186     break;
187   case 2:
188     yMin = y + descent;
189     yMax = y + ascent;
190     if (yMin == yMax) {
191       // this is a sanity check for a case that shouldn't happen -- but
192       // if it does happen, we want to avoid dividing by zero later
193       yMin = y;
194       yMax = y + 1;
195     }
196     base = y;
197     break;
198   case 3:
199     xMin = x - ascent;
200     xMax = x - descent;
201     if (xMin == xMax) {
202       // this is a sanity check for a case that shouldn't happen -- but
203       // if it does happen, we want to avoid dividing by zero later
204       xMin = x;
205       xMax = x + 1;
206     }
207     base = x;
208     break;
209   }
210   text = NULL;
211   edge = NULL;
212   len = size = 0;
213   spaceAfter = gFalse;
214   next = NULL;
215 
216 #if TEXTOUT_WORD_LIST
217   GfxRGB rgb;
218 
219   if ((state->getRender() & 3) == 1) {
220     state->getStrokeRGB(&rgb);
221   } else {
222     state->getFillRGB(&rgb);
223   }
224   colorR = colToDbl(rgb.r);
225   colorG = colToDbl(rgb.g);
226   colorB = colToDbl(rgb.b);
227 #endif
228 }
229 
~TextWord()230 TextWord::~TextWord() {
231   gfree(text);
232   gfree(edge);
233 }
234 
addChar(GfxState * state,double x,double y,double dx,double dy,Unicode u)235 void TextWord::addChar(GfxState *state, double x, double y,
236 		       double dx, double dy, Unicode u) {
237   if (len == size) {
238     size += 16;
239     text = (Unicode *)greallocn(text, size, sizeof(Unicode));
240     edge = (double *)greallocn(edge, size + 1, sizeof(double));
241   }
242   text[len] = u;
243   switch (rot) {
244   case 0:
245     if (len == 0) {
246       xMin = x;
247     }
248     edge[len] = x;
249     xMax = edge[len+1] = x + dx;
250     break;
251   case 1:
252     if (len == 0) {
253       yMin = y;
254     }
255     edge[len] = y;
256     yMax = edge[len+1] = y + dy;
257     break;
258   case 2:
259     if (len == 0) {
260       xMax = x;
261     }
262     edge[len] = x;
263     xMin = edge[len+1] = x + dx;
264     break;
265   case 3:
266     if (len == 0) {
267       yMax = y;
268     }
269     edge[len] = y;
270     yMin = edge[len+1] = y + dy;
271     break;
272   }
273   ++len;
274 }
275 
merge(TextWord * word)276 void TextWord::merge(TextWord *word) {
277   int i;
278 
279   if (word->xMin < xMin) {
280     xMin = word->xMin;
281   }
282   if (word->yMin < yMin) {
283     yMin = word->yMin;
284   }
285   if (word->xMax > xMax) {
286     xMax = word->xMax;
287   }
288   if (word->yMax > yMax) {
289     yMax = word->yMax;
290   }
291   if (len + word->len > size) {
292     size = len + word->len;
293     text = (Unicode *)greallocn(text, size, sizeof(Unicode));
294     edge = (double *)greallocn(edge, size + 1, sizeof(double));
295   }
296   for (i = 0; i < word->len; ++i) {
297     text[len + i] = word->text[i];
298     edge[len + i] = word->edge[i];
299   }
300   edge[len + word->len] = word->edge[word->len];
301   len += word->len;
302   charLen += word->charLen;
303 }
304 
primaryCmp(TextWord * word)305 inline int TextWord::primaryCmp(TextWord *word) {
306   double cmp;
307 
308   cmp = 0; // make gcc happy
309   switch (rot) {
310   case 0:
311     cmp = xMin - word->xMin;
312     break;
313   case 1:
314     cmp = yMin - word->yMin;
315     break;
316   case 2:
317     cmp = word->xMax - xMax;
318     break;
319   case 3:
320     cmp = word->yMax - yMax;
321     break;
322   }
323   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
324 }
325 
primaryDelta(TextWord * word)326 double TextWord::primaryDelta(TextWord *word) {
327   double delta;
328 
329   delta = 0; // make gcc happy
330   switch (rot) {
331   case 0:
332     delta = word->xMin - xMax;
333     break;
334   case 1:
335     delta = word->yMin - yMax;
336     break;
337   case 2:
338     delta = xMin - word->xMax;
339     break;
340   case 3:
341     delta = yMin - word->yMax;
342     break;
343   }
344   return delta;
345 }
346 
cmpYX(const void * p1,const void * p2)347 int TextWord::cmpYX(const void *p1, const void *p2) {
348   TextWord *word1 = *(TextWord **)p1;
349   TextWord *word2 = *(TextWord **)p2;
350   double cmp;
351 
352   cmp = word1->yMin - word2->yMin;
353   if (cmp == 0) {
354     cmp = word1->xMin - word2->xMin;
355   }
356   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
357 }
358 
359 #if TEXTOUT_WORD_LIST
360 
getText()361 GString *TextWord::getText() {
362   GString *s;
363   UnicodeMap *uMap;
364   char buf[8];
365   int n, i;
366 
367   s = new GString();
368   if (!(uMap = globalParams->getTextEncoding())) {
369     return s;
370   }
371   for (i = 0; i < len; ++i) {
372     n = uMap->mapUnicode(text[i], buf, sizeof(buf));
373     s->append(buf, n);
374   }
375   uMap->decRefCnt();
376   return s;
377 }
378 
379 #endif // TEXTOUT_WORD_LIST
380 
381 //------------------------------------------------------------------------
382 // TextPool
383 //------------------------------------------------------------------------
384 
TextPool()385 TextPool::TextPool() {
386   minBaseIdx = 0;
387   maxBaseIdx = -1;
388   pool = NULL;
389   cursor = NULL;
390   cursorBaseIdx = -1;
391 }
392 
~TextPool()393 TextPool::~TextPool() {
394   int baseIdx;
395   TextWord *word, *word2;
396 
397   for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
398     for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
399       word2 = word->next;
400       delete word;
401     }
402   }
403   gfree(pool);
404 }
405 
getBaseIdx(double base)406 int TextPool::getBaseIdx(double base) {
407   int baseIdx;
408 
409   baseIdx = (int)(base / textPoolStep);
410   if (baseIdx < minBaseIdx) {
411     return minBaseIdx;
412   }
413   if (baseIdx > maxBaseIdx) {
414     return maxBaseIdx;
415   }
416   return baseIdx;
417 }
418 
addWord(TextWord * word)419 void TextPool::addWord(TextWord *word) {
420   TextWord **newPool;
421   int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
422   TextWord *w0, *w1;
423 
424   // expand the array if needed
425   wordBaseIdx = (int)(word->base / textPoolStep);
426   if (minBaseIdx > maxBaseIdx) {
427     minBaseIdx = wordBaseIdx - 128;
428     maxBaseIdx = wordBaseIdx + 128;
429     pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
430 				 sizeof(TextWord *));
431     for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
432       pool[baseIdx - minBaseIdx] = NULL;
433     }
434   } else if (wordBaseIdx < minBaseIdx) {
435     newMinBaseIdx = wordBaseIdx - 128;
436     newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
437 				    sizeof(TextWord *));
438     for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
439       newPool[baseIdx - newMinBaseIdx] = NULL;
440     }
441     memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
442 	   (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
443     gfree(pool);
444     pool = newPool;
445     minBaseIdx = newMinBaseIdx;
446   } else if (wordBaseIdx > maxBaseIdx) {
447     newMaxBaseIdx = wordBaseIdx + 128;
448     pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
449 				  sizeof(TextWord *));
450     for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
451       pool[baseIdx - minBaseIdx] = NULL;
452     }
453     maxBaseIdx = newMaxBaseIdx;
454   }
455 
456   // insert the new word
457   if (cursor && wordBaseIdx == cursorBaseIdx &&
458       word->primaryCmp(cursor) > 0) {
459     w0 = cursor;
460     w1 = cursor->next;
461   } else {
462     w0 = NULL;
463     w1 = pool[wordBaseIdx - minBaseIdx];
464   }
465   for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
466   word->next = w1;
467   if (w0) {
468     w0->next = word;
469   } else {
470     pool[wordBaseIdx - minBaseIdx] = word;
471   }
472   cursor = word;
473   cursorBaseIdx = wordBaseIdx;
474 }
475 
476 //------------------------------------------------------------------------
477 // TextLine
478 //------------------------------------------------------------------------
479 
TextLine(TextBlock * blkA,int rotA,double baseA)480 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
481   blk = blkA;
482   rot = rotA;
483   xMin = yMin = 0;
484   xMax = yMax = -1;
485   base = baseA;
486   words = lastWord = NULL;
487   text = NULL;
488   edge = NULL;
489   col = NULL;
490   len = 0;
491   convertedLen = 0;
492   hyphenated = gFalse;
493   next = NULL;
494 }
495 
~TextLine()496 TextLine::~TextLine() {
497   TextWord *word;
498 
499   while (words) {
500     word = words;
501     words = words->next;
502     delete word;
503   }
504   gfree(text);
505   gfree(edge);
506   gfree(col);
507 }
508 
addWord(TextWord * word)509 void TextLine::addWord(TextWord *word) {
510   if (lastWord) {
511     lastWord->next = word;
512   } else {
513     words = word;
514   }
515   lastWord = word;
516 
517   if (xMin > xMax) {
518     xMin = word->xMin;
519     xMax = word->xMax;
520     yMin = word->yMin;
521     yMax = word->yMax;
522   } else {
523     if (word->xMin < xMin) {
524       xMin = word->xMin;
525     }
526     if (word->xMax > xMax) {
527       xMax = word->xMax;
528     }
529     if (word->yMin < yMin) {
530       yMin = word->yMin;
531     }
532     if (word->yMax > yMax) {
533       yMax = word->yMax;
534     }
535   }
536 }
537 
primaryDelta(TextLine * line)538 double TextLine::primaryDelta(TextLine *line) {
539   double delta;
540 
541   delta = 0; // make gcc happy
542   switch (rot) {
543   case 0:
544     delta = line->xMin - xMax;
545     break;
546   case 1:
547     delta = line->yMin - yMax;
548     break;
549   case 2:
550     delta = xMin - line->xMax;
551     break;
552   case 3:
553     delta = yMin - line->yMax;
554     break;
555   }
556   return delta;
557 }
558 
primaryCmp(TextLine * line)559 int TextLine::primaryCmp(TextLine *line) {
560   double cmp;
561 
562   cmp = 0; // make gcc happy
563   switch (rot) {
564   case 0:
565     cmp = xMin - line->xMin;
566     break;
567   case 1:
568     cmp = yMin - line->yMin;
569     break;
570   case 2:
571     cmp = line->xMax - xMax;
572     break;
573   case 3:
574     cmp = line->yMax - yMax;
575     break;
576   }
577   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
578 }
579 
secondaryCmp(TextLine * line)580 int TextLine::secondaryCmp(TextLine *line) {
581   double cmp;
582 
583   cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
584   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
585 }
586 
cmpYX(TextLine * line)587 int TextLine::cmpYX(TextLine *line) {
588   int cmp;
589 
590   if ((cmp = secondaryCmp(line))) {
591     return cmp;
592   }
593   return primaryCmp(line);
594 }
595 
cmpXY(const void * p1,const void * p2)596 int TextLine::cmpXY(const void *p1, const void *p2) {
597   TextLine *line1 = *(TextLine **)p1;
598   TextLine *line2 = *(TextLine **)p2;
599   int cmp;
600 
601   if ((cmp = line1->primaryCmp(line2))) {
602     return cmp;
603   }
604   return line1->secondaryCmp(line2);
605 }
606 
coalesce(UnicodeMap * uMap)607 void TextLine::coalesce(UnicodeMap *uMap) {
608   TextWord *word0, *word1;
609   double space, delta, minSpace;
610   GBool isUnicode;
611   char buf[8];
612   int i, j;
613 
614   if (words->next) {
615 
616     // compute the inter-word space threshold
617     if (words->len > 1 || words->next->len > 1) {
618       minSpace = 0;
619     } else {
620       minSpace = words->primaryDelta(words->next);
621       for (word0 = words->next, word1 = word0->next;
622 	   word1 && minSpace > 0;
623 	   word0 = word1, word1 = word0->next) {
624 	if (word1->len > 1) {
625 	  minSpace = 0;
626 	}
627 	delta = word0->primaryDelta(word1);
628 	if (delta < minSpace) {
629 	  minSpace = delta;
630 	}
631       }
632     }
633     if (minSpace <= 0) {
634       space = maxCharSpacing * words->fontSize;
635     } else {
636       space = maxWideCharSpacingMul * minSpace;
637     }
638 
639     // merge words
640     word0 = words;
641     word1 = words->next;
642     while (word1) {
643       if (word0->primaryDelta(word1) >= space) {
644 	word0->spaceAfter = gTrue;
645 	word0 = word1;
646 	word1 = word1->next;
647       } else if (word0->font == word1->font &&
648 		 fabs(word0->fontSize - word1->fontSize) <
649 		 maxWordFontSizeDelta * words->fontSize &&
650 		 word1->charPos == word0->charPos + word0->charLen) {
651 	word0->merge(word1);
652 	word0->next = word1->next;
653 	delete word1;
654 	word1 = word0->next;
655       } else {
656 	word0 = word1;
657 	word1 = word1->next;
658       }
659     }
660   }
661 
662   // build the line text
663   isUnicode = uMap ? uMap->isUnicode() : gFalse;
664   len = 0;
665   for (word1 = words; word1; word1 = word1->next) {
666     len += word1->len;
667     if (word1->spaceAfter) {
668       ++len;
669     }
670   }
671   text = (Unicode *)gmallocn(len, sizeof(Unicode));
672   edge = (double *)gmallocn(len + 1, sizeof(double));
673   i = 0;
674   for (word1 = words; word1; word1 = word1->next) {
675     for (j = 0; j < word1->len; ++j) {
676       text[i] = word1->text[j];
677       edge[i] = word1->edge[j];
678       ++i;
679     }
680     edge[i] = word1->edge[word1->len];
681     if (word1->spaceAfter) {
682       text[i] = (Unicode)0x0020;
683       ++i;
684     }
685   }
686 
687   // compute convertedLen and set up the col array
688   col = (int *)gmallocn(len + 1, sizeof(int));
689   convertedLen = 0;
690   for (i = 0; i < len; ++i) {
691     col[i] = convertedLen;
692     if (isUnicode) {
693       ++convertedLen;
694     } else if (uMap) {
695       convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
696     }
697   }
698   col[len] = convertedLen;
699 
700   // check for hyphen at end of line
701   //~ need to check for other chars used as hyphens
702   hyphenated = text[len - 1] == (Unicode)'-';
703 }
704 
705 //------------------------------------------------------------------------
706 // TextLineFrag
707 //------------------------------------------------------------------------
708 
709 class TextLineFrag {
710 public:
711 
712   TextLine *line;		// the line object
713   int start, len;		// offset and length of this fragment
714 				//   (in Unicode chars)
715   double xMin, xMax;		// bounding box coordinates
716   double yMin, yMax;
717   double base;			// baseline virtual coordinate
718   int col;			// first column
719 
720   void init(TextLine *lineA, int startA, int lenA);
721   void computeCoords(GBool oneRot);
722 
723   static int cmpYXPrimaryRot(const void *p1, const void *p2);
724   static int cmpYXLineRot(const void *p1, const void *p2);
725   static int cmpXYLineRot(const void *p1, const void *p2);
726 };
727 
init(TextLine * lineA,int startA,int lenA)728 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
729   line = lineA;
730   start = startA;
731   len = lenA;
732   col = line->col[start];
733 }
734 
computeCoords(GBool oneRot)735 void TextLineFrag::computeCoords(GBool oneRot) {
736   TextBlock *blk;
737   double d0, d1, d2, d3, d4;
738 
739   if (oneRot) {
740 
741     switch (line->rot) {
742     case 0:
743       xMin = line->edge[start];
744       xMax = line->edge[start + len];
745       yMin = line->yMin;
746       yMax = line->yMax;
747       break;
748     case 1:
749       xMin = line->xMin;
750       xMax = line->xMax;
751       yMin = line->edge[start];
752       yMax = line->edge[start + len];
753       break;
754     case 2:
755       xMin = line->edge[start + len];
756       xMax = line->edge[start];
757       yMin = line->yMin;
758       yMax = line->yMax;
759       break;
760     case 3:
761       xMin = line->xMin;
762       xMax = line->xMax;
763       yMin = line->edge[start + len];
764       yMax = line->edge[start];
765       break;
766     }
767     base = line->base;
768 
769   } else {
770 
771     if (line->rot == 0 && line->blk->page->primaryRot == 0) {
772 
773       xMin = line->edge[start];
774       xMax = line->edge[start + len];
775       yMin = line->yMin;
776       yMax = line->yMax;
777       base = line->base;
778 
779     } else {
780 
781       blk = line->blk;
782       d0 = line->edge[start];
783       d1 = line->edge[start + len];
784       d2 = d3 = d4 = 0; // make gcc happy
785 
786       switch (line->rot) {
787       case 0:
788 	d2 = line->yMin;
789 	d3 = line->yMax;
790 	d4 = line->base;
791 	d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
792 	d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
793 	d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
794 	d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
795 	d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
796 	break;
797       case 1:
798 	d2 = line->xMax;
799 	d3 = line->xMin;
800 	d4 = line->base;
801 	d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
802 	d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
803 	d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
804 	d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
805 	d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
806 	break;
807       case 2:
808 	d2 = line->yMax;
809 	d3 = line->yMin;
810 	d4 = line->base;
811 	d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
812 	d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
813 	d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
814 	d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
815 	d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
816 	break;
817       case 3:
818 	d2 = line->xMin;
819 	d3 = line->xMax;
820 	d4 = line->base;
821 	d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
822 	d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
823 	d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
824 	d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
825 	d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
826 	break;
827       }
828 
829       switch (line->blk->page->primaryRot) {
830       case 0:
831 	xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
832 	xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
833 	yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
834 	yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
835 	base = blk->yMin + base * (blk->yMax - blk->yMin);
836 	break;
837       case 1:
838 	xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
839 	xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
840 	yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
841 	yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
842 	base = blk->xMax - d4 * (blk->xMax - blk->xMin);
843 	break;
844       case 2:
845 	xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
846 	xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
847 	yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
848 	yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
849 	base = blk->yMax - d4 * (blk->yMax - blk->yMin);
850 	break;
851       case 3:
852 	xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
853 	xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
854 	yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
855 	yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
856 	base = blk->xMin + d4 * (blk->xMax - blk->xMin);
857 	break;
858       }
859 
860     }
861   }
862 }
863 
cmpYXPrimaryRot(const void * p1,const void * p2)864 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
865   TextLineFrag *frag1 = (TextLineFrag *)p1;
866   TextLineFrag *frag2 = (TextLineFrag *)p2;
867   double cmp;
868 
869   cmp = 0; // make gcc happy
870   switch (frag1->line->blk->page->primaryRot) {
871   case 0:
872     if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
873       cmp = frag1->xMin - frag2->xMin;
874     }
875     break;
876   case 1:
877     if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
878       cmp = frag1->yMin - frag2->yMin;
879     }
880     break;
881   case 2:
882     if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
883       cmp = frag2->xMax - frag1->xMax;
884     }
885     break;
886   case 3:
887     if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
888       cmp = frag2->yMax - frag1->yMax;
889     }
890     break;
891   }
892   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
893 }
894 
cmpYXLineRot(const void * p1,const void * p2)895 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
896   TextLineFrag *frag1 = (TextLineFrag *)p1;
897   TextLineFrag *frag2 = (TextLineFrag *)p2;
898   double cmp;
899 
900   cmp = 0; // make gcc happy
901   switch (frag1->line->rot) {
902   case 0:
903     if ((cmp = frag1->yMin - frag2->yMin) == 0) {
904       cmp = frag1->xMin - frag2->xMin;
905     }
906     break;
907   case 1:
908     if ((cmp = frag2->xMax - frag1->xMax) == 0) {
909       cmp = frag1->yMin - frag2->yMin;
910     }
911     break;
912   case 2:
913     if ((cmp = frag2->yMin - frag1->yMin) == 0) {
914       cmp = frag2->xMax - frag1->xMax;
915     }
916     break;
917   case 3:
918     if ((cmp = frag1->xMax - frag2->xMax) == 0) {
919       cmp = frag2->yMax - frag1->yMax;
920     }
921     break;
922   }
923   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
924 }
925 
cmpXYLineRot(const void * p1,const void * p2)926 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
927   TextLineFrag *frag1 = (TextLineFrag *)p1;
928   TextLineFrag *frag2 = (TextLineFrag *)p2;
929   double cmp;
930 
931   cmp = 0; // make gcc happy
932   switch (frag1->line->rot) {
933   case 0:
934     if ((cmp = frag1->xMin - frag2->xMin) == 0) {
935       cmp = frag1->yMin - frag2->yMin;
936     }
937     break;
938   case 1:
939     if ((cmp = frag1->yMin - frag2->yMin) == 0) {
940       cmp = frag2->xMax - frag1->xMax;
941     }
942     break;
943   case 2:
944     if ((cmp = frag2->xMax - frag1->xMax) == 0) {
945       cmp = frag2->yMin - frag1->yMin;
946     }
947     break;
948   case 3:
949     if ((cmp = frag2->yMax - frag1->yMax) == 0) {
950       cmp = frag1->xMax - frag2->xMax;
951     }
952     break;
953   }
954   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
955 }
956 
957 //------------------------------------------------------------------------
958 // TextBlock
959 //------------------------------------------------------------------------
960 
TextBlock(TextPage * pageA,int rotA)961 TextBlock::TextBlock(TextPage *pageA, int rotA) {
962   page = pageA;
963   rot = rotA;
964   xMin = yMin = 0;
965   xMax = yMax = -1;
966   priMin = 0;
967   priMax = page->pageWidth;
968   pool = new TextPool();
969   lines = NULL;
970   curLine = NULL;
971   next = NULL;
972   stackNext = NULL;
973 }
974 
~TextBlock()975 TextBlock::~TextBlock() {
976   TextLine *line;
977 
978   delete pool;
979   while (lines) {
980     line = lines;
981     lines = lines->next;
982     delete line;
983   }
984 }
985 
addWord(TextWord * word)986 void TextBlock::addWord(TextWord *word) {
987   pool->addWord(word);
988   if (xMin > xMax) {
989     xMin = word->xMin;
990     xMax = word->xMax;
991     yMin = word->yMin;
992     yMax = word->yMax;
993   } else {
994     if (word->xMin < xMin) {
995       xMin = word->xMin;
996     }
997     if (word->xMax > xMax) {
998       xMax = word->xMax;
999     }
1000     if (word->yMin < yMin) {
1001       yMin = word->yMin;
1002     }
1003     if (word->yMax > yMax) {
1004       yMax = word->yMax;
1005     }
1006   }
1007 }
1008 
coalesce(UnicodeMap * uMap)1009 void TextBlock::coalesce(UnicodeMap *uMap) {
1010   TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1011   TextLine *line, *line0, *line1;
1012   int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1013   int baseIdx, bestWordBaseIdx, idx0, idx1;
1014   double minBase, maxBase;
1015   double fontSize, delta, priDelta, secDelta;
1016   TextLine **lineArray;
1017   GBool found;
1018   int col1, col2;
1019   int i, j, k;
1020 
1021   // discard duplicated text (fake boldface, drop shadows)
1022   for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1023     word0 = pool->getPool(idx0);
1024     while (word0) {
1025       priDelta = dupMaxPriDelta * word0->fontSize;
1026       secDelta = dupMaxSecDelta * word0->fontSize;
1027       if (rot == 0 || rot == 3) {
1028 	maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1029       } else {
1030 	maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1031       }
1032       found = gFalse;
1033       word1 = word2 = NULL; // make gcc happy
1034       for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1035 	if (idx1 == idx0) {
1036 	  word1 = word0;
1037 	  word2 = word0->next;
1038 	} else {
1039 	  word1 = NULL;
1040 	  word2 = pool->getPool(idx1);
1041 	}
1042 	for (; word2; word1 = word2, word2 = word2->next) {
1043 	  if (word2->len == word0->len &&
1044 	      !memcmp(word2->text, word0->text,
1045 		      word0->len * sizeof(Unicode))) {
1046 	    switch (rot) {
1047 	    case 0:
1048 	    case 2:
1049 	      found = fabs(word0->xMin - word2->xMin) < priDelta &&
1050 		      fabs(word0->xMax - word2->xMax) < priDelta &&
1051 		      fabs(word0->yMin - word2->yMin) < secDelta &&
1052 		      fabs(word0->yMax - word2->yMax) < secDelta;
1053 	      break;
1054 	    case 1:
1055 	    case 3:
1056 	      found = fabs(word0->xMin - word2->xMin) < secDelta &&
1057 		      fabs(word0->xMax - word2->xMax) < secDelta &&
1058 		      fabs(word0->yMin - word2->yMin) < priDelta &&
1059 		      fabs(word0->yMax - word2->yMax) < priDelta;
1060 	      break;
1061 	    }
1062 	  }
1063 	  if (found) {
1064 	    break;
1065 	  }
1066 	}
1067 	if (found) {
1068 	  break;
1069 	}
1070       }
1071       if (found) {
1072 	if (word1) {
1073 	  word1->next = word2->next;
1074 	} else {
1075 	  pool->setPool(idx1, word2->next);
1076 	}
1077 	delete word2;
1078       } else {
1079 	word0 = word0->next;
1080       }
1081     }
1082   }
1083 
1084   // build the lines
1085   curLine = NULL;
1086   poolMinBaseIdx = pool->minBaseIdx;
1087   charCount = 0;
1088   nLines = 0;
1089   while (1) {
1090 
1091     // find the first non-empty line in the pool
1092     for (;
1093 	 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1094 	 ++poolMinBaseIdx) ;
1095     if (poolMinBaseIdx > pool->maxBaseIdx) {
1096       break;
1097     }
1098 
1099     // look for the left-most word in the first four lines of the
1100     // pool -- this avoids starting with a superscript word
1101     startBaseIdx = poolMinBaseIdx;
1102     for (baseIdx = poolMinBaseIdx + 1;
1103 	 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1104 	 ++baseIdx) {
1105       if (!pool->getPool(baseIdx)) {
1106 	continue;
1107       }
1108       if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1109 	  < 0) {
1110 	startBaseIdx = baseIdx;
1111       }
1112     }
1113 
1114     // create a new line
1115     word0 = pool->getPool(startBaseIdx);
1116     pool->setPool(startBaseIdx, word0->next);
1117     word0->next = NULL;
1118     line = new TextLine(this, word0->rot, word0->base);
1119     line->addWord(word0);
1120     lastWord = word0;
1121 
1122     // compute the search range
1123     fontSize = word0->fontSize;
1124     minBase = word0->base - maxIntraLineDelta * fontSize;
1125     maxBase = word0->base + maxIntraLineDelta * fontSize;
1126     minBaseIdx = pool->getBaseIdx(minBase);
1127     maxBaseIdx = pool->getBaseIdx(maxBase);
1128 
1129     // find the rest of the words in this line
1130     while (1) {
1131 
1132       // find the left-most word whose baseline is in the range for
1133       // this line
1134       bestWordBaseIdx = 0;
1135       bestWord0 = bestWord1 = NULL;
1136       for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1137 	for (word0 = NULL, word1 = pool->getPool(baseIdx);
1138 	     word1;
1139 	     word0 = word1, word1 = word1->next) {
1140 	  if (word1->base >= minBase &&
1141 	      word1->base <= maxBase &&
1142 	      (delta = lastWord->primaryDelta(word1)) >=
1143 	        minCharSpacing * fontSize) {
1144 	    if (delta < maxWordSpacing * fontSize &&
1145 		(!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1146 	      bestWordBaseIdx = baseIdx;
1147 	      bestWord0 = word0;
1148 	      bestWord1 = word1;
1149 	    }
1150 	    break;
1151 	  }
1152 	}
1153       }
1154       if (!bestWord1) {
1155 	break;
1156       }
1157 
1158       // remove it from the pool, and add it to the line
1159       if (bestWord0) {
1160 	bestWord0->next = bestWord1->next;
1161       } else {
1162 	pool->setPool(bestWordBaseIdx, bestWord1->next);
1163       }
1164       bestWord1->next = NULL;
1165       line->addWord(bestWord1);
1166       lastWord = bestWord1;
1167     }
1168 
1169     // add the line
1170     if (curLine && line->cmpYX(curLine) > 0) {
1171       line0 = curLine;
1172       line1 = curLine->next;
1173     } else {
1174       line0 = NULL;
1175       line1 = lines;
1176     }
1177     for (;
1178 	 line1 && line->cmpYX(line1) > 0;
1179 	 line0 = line1, line1 = line1->next) ;
1180     if (line0) {
1181       line0->next = line;
1182     } else {
1183       lines = line;
1184     }
1185     line->next = line1;
1186     curLine = line;
1187     line->coalesce(uMap);
1188     charCount += line->len;
1189     ++nLines;
1190   }
1191 
1192   // sort lines into xy order for column assignment
1193   lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1194   for (line = lines, i = 0; line; line = line->next, ++i) {
1195     lineArray[i] = line;
1196   }
1197   qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1198 
1199   // column assignment
1200   nColumns = 0;
1201   for (i = 0; i < nLines; ++i) {
1202     line0 = lineArray[i];
1203     col1 = 0;
1204     for (j = 0; j < i; ++j) {
1205       line1 = lineArray[j];
1206       if (line1->primaryDelta(line0) >= 0) {
1207 	col2 = line1->col[line1->len] + 1;
1208       } else {
1209 	k = 0; // make gcc happy
1210 	switch (rot) {
1211 	case 0:
1212 	  for (k = 0;
1213 	       k < line1->len &&
1214 		 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1215 	       ++k) ;
1216 	  break;
1217 	case 1:
1218 	  for (k = 0;
1219 	       k < line1->len &&
1220 		 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1221 	       ++k) ;
1222 	  break;
1223 	case 2:
1224 	  for (k = 0;
1225 	       k < line1->len &&
1226 		 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1227 	       ++k) ;
1228 	  break;
1229 	case 3:
1230 	  for (k = 0;
1231 	       k < line1->len &&
1232 		 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1233 	       ++k) ;
1234 	  break;
1235 	}
1236 	col2 = line1->col[k];
1237       }
1238       if (col2 > col1) {
1239 	col1 = col2;
1240       }
1241     }
1242     for (k = 0; k <= line0->len; ++k) {
1243       line0->col[k] += col1;
1244     }
1245     if (line0->col[line0->len] > nColumns) {
1246       nColumns = line0->col[line0->len];
1247     }
1248   }
1249   gfree(lineArray);
1250 }
1251 
updatePriMinMax(TextBlock * blk)1252 void TextBlock::updatePriMinMax(TextBlock *blk) {
1253   double newPriMin, newPriMax;
1254   GBool gotPriMin, gotPriMax;
1255 
1256   gotPriMin = gotPriMax = gFalse;
1257   newPriMin = newPriMax = 0; // make gcc happy
1258   switch (page->primaryRot) {
1259   case 0:
1260   case 2:
1261     if (blk->yMin < yMax && blk->yMax > yMin) {
1262       if (blk->xMin < xMin) {
1263 	newPriMin = blk->xMax;
1264 	gotPriMin = gTrue;
1265       }
1266       if (blk->xMax > xMax) {
1267 	newPriMax = blk->xMin;
1268 	gotPriMax = gTrue;
1269       }
1270     }
1271     break;
1272   case 1:
1273   case 3:
1274     if (blk->xMin < xMax && blk->xMax > xMin) {
1275       if (blk->yMin < yMin) {
1276 	newPriMin = blk->yMax;
1277 	gotPriMin = gTrue;
1278       }
1279       if (blk->yMax > yMax) {
1280 	newPriMax = blk->yMin;
1281 	gotPriMax = gTrue;
1282       }
1283     }
1284     break;
1285   }
1286   if (gotPriMin) {
1287     if (newPriMin > xMin) {
1288       newPriMin = xMin;
1289     }
1290     if (newPriMin > priMin) {
1291       priMin = newPriMin;
1292     }
1293   }
1294   if (gotPriMax) {
1295     if (newPriMax < xMax) {
1296       newPriMax = xMax;
1297     }
1298     if (newPriMax < priMax) {
1299       priMax = newPriMax;
1300     }
1301   }
1302 }
1303 
cmpXYPrimaryRot(const void * p1,const void * p2)1304 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1305   TextBlock *blk1 = *(TextBlock **)p1;
1306   TextBlock *blk2 = *(TextBlock **)p2;
1307   double cmp;
1308 
1309   cmp = 0; // make gcc happy
1310   switch (blk1->page->primaryRot) {
1311   case 0:
1312     if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1313       cmp = blk1->yMin - blk2->yMin;
1314     }
1315     break;
1316   case 1:
1317     if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1318       cmp = blk2->xMax - blk1->xMax;
1319     }
1320     break;
1321   case 2:
1322     if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1323       cmp = blk2->yMin - blk1->yMin;
1324     }
1325     break;
1326   case 3:
1327     if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1328       cmp = blk1->xMax - blk2->xMax;
1329     }
1330     break;
1331   }
1332   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1333 }
1334 
cmpYXPrimaryRot(const void * p1,const void * p2)1335 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1336   TextBlock *blk1 = *(TextBlock **)p1;
1337   TextBlock *blk2 = *(TextBlock **)p2;
1338   double cmp;
1339 
1340   cmp = 0; // make gcc happy
1341   switch (blk1->page->primaryRot) {
1342   case 0:
1343     if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1344       cmp = blk1->xMin - blk2->xMin;
1345     }
1346     break;
1347   case 1:
1348     if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1349       cmp = blk1->yMin - blk2->yMin;
1350     }
1351     break;
1352   case 2:
1353     if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1354       cmp = blk2->xMax - blk1->xMax;
1355     }
1356     break;
1357   case 3:
1358     if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1359       cmp = blk2->yMax - blk1->yMax;
1360     }
1361     break;
1362   }
1363   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1364 }
1365 
primaryCmp(TextBlock * blk)1366 int TextBlock::primaryCmp(TextBlock *blk) {
1367   double cmp;
1368 
1369   cmp = 0; // make gcc happy
1370   switch (rot) {
1371   case 0:
1372     cmp = xMin - blk->xMin;
1373     break;
1374   case 1:
1375     cmp = yMin - blk->yMin;
1376     break;
1377   case 2:
1378     cmp = blk->xMax - xMax;
1379     break;
1380   case 3:
1381     cmp = blk->yMax - yMax;
1382     break;
1383   }
1384   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1385 }
1386 
secondaryDelta(TextBlock * blk)1387 double TextBlock::secondaryDelta(TextBlock *blk) {
1388   double delta;
1389 
1390   delta = 0; // make gcc happy
1391   switch (rot) {
1392   case 0:
1393     delta = blk->yMin - yMax;
1394     break;
1395   case 1:
1396     delta = xMin - blk->xMax;
1397     break;
1398   case 2:
1399     delta = yMin - blk->yMax;
1400     break;
1401   case 3:
1402     delta = blk->xMin - xMax;
1403     break;
1404   }
1405   return delta;
1406 }
1407 
isBelow(TextBlock * blk)1408 GBool TextBlock::isBelow(TextBlock *blk) {
1409   GBool below;
1410 
1411   below = gFalse; // make gcc happy
1412   switch (page->primaryRot) {
1413   case 0:
1414     below = xMin >= blk->priMin && xMax <= blk->priMax &&
1415             yMin > blk->yMin;
1416     break;
1417   case 1:
1418     below = yMin >= blk->priMin && yMax <= blk->priMax &&
1419             xMax < blk->xMax;
1420     break;
1421   case 2:
1422     below = xMin >= blk->priMin && xMax <= blk->priMax &&
1423             yMax < blk->yMax;
1424     break;
1425   case 3:
1426     below = yMin >= blk->priMin && yMax <= blk->priMax &&
1427             xMin > blk->xMin;
1428     break;
1429   }
1430 
1431   return below;
1432 }
1433 
1434 //------------------------------------------------------------------------
1435 // TextFlow
1436 //------------------------------------------------------------------------
1437 
TextFlow(TextPage * pageA,TextBlock * blk)1438 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1439   page = pageA;
1440   xMin = blk->xMin;
1441   xMax = blk->xMax;
1442   yMin = blk->yMin;
1443   yMax = blk->yMax;
1444   priMin = blk->priMin;
1445   priMax = blk->priMax;
1446   blocks = lastBlk = blk;
1447   next = NULL;
1448 }
1449 
~TextFlow()1450 TextFlow::~TextFlow() {
1451   TextBlock *blk;
1452 
1453   while (blocks) {
1454     blk = blocks;
1455     blocks = blocks->next;
1456     delete blk;
1457   }
1458 }
1459 
addBlock(TextBlock * blk)1460 void TextFlow::addBlock(TextBlock *blk) {
1461   if (lastBlk) {
1462     lastBlk->next = blk;
1463   } else {
1464     blocks = blk;
1465   }
1466   lastBlk = blk;
1467   if (blk->xMin < xMin) {
1468     xMin = blk->xMin;
1469   }
1470   if (blk->xMax > xMax) {
1471     xMax = blk->xMax;
1472   }
1473   if (blk->yMin < yMin) {
1474     yMin = blk->yMin;
1475   }
1476   if (blk->yMax > yMax) {
1477     yMax = blk->yMax;
1478   }
1479 }
1480 
blockFits(TextBlock * blk,TextBlock * prevBlk)1481 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1482   GBool fits;
1483 
1484   // lower blocks must use smaller fonts
1485   if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1486     return gFalse;
1487   }
1488 
1489   fits = gFalse; // make gcc happy
1490   switch (page->primaryRot) {
1491   case 0:
1492     fits = blk->xMin >= priMin && blk->xMax <= priMax;
1493     break;
1494   case 1:
1495     fits = blk->yMin >= priMin && blk->yMax <= priMax;
1496     break;
1497   case 2:
1498     fits = blk->xMin >= priMin && blk->xMax <= priMax;
1499     break;
1500   case 3:
1501     fits = blk->yMin >= priMin && blk->yMax <= priMax;
1502     break;
1503   }
1504   return fits;
1505 }
1506 
1507 #if TEXTOUT_WORD_LIST
1508 
1509 //------------------------------------------------------------------------
1510 // TextWordList
1511 //------------------------------------------------------------------------
1512 
TextWordList(TextPage * text,GBool physLayout)1513 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1514   TextFlow *flow;
1515   TextBlock *blk;
1516   TextLine *line;
1517   TextWord *word;
1518   TextWord **wordArray;
1519   int nWords, i;
1520 
1521   words = new GList();
1522 
1523   if (text->rawOrder) {
1524     for (word = text->rawWords; word; word = word->next) {
1525       words->append(word);
1526     }
1527 
1528   } else if (physLayout) {
1529     // this is inefficient, but it's also the least useful of these
1530     // three cases
1531     nWords = 0;
1532     for (flow = text->flows; flow; flow = flow->next) {
1533       for (blk = flow->blocks; blk; blk = blk->next) {
1534 	for (line = blk->lines; line; line = line->next) {
1535 	  for (word = line->words; word; word = word->next) {
1536 	    ++nWords;
1537 	  }
1538 	}
1539       }
1540     }
1541     wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1542     i = 0;
1543     for (flow = text->flows; flow; flow = flow->next) {
1544       for (blk = flow->blocks; blk; blk = blk->next) {
1545 	for (line = blk->lines; line; line = line->next) {
1546 	  for (word = line->words; word; word = word->next) {
1547 	    wordArray[i++] = word;
1548 	  }
1549 	}
1550       }
1551     }
1552     qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1553     for (i = 0; i < nWords; ++i) {
1554       words->append(wordArray[i]);
1555     }
1556     gfree(wordArray);
1557 
1558   } else {
1559     for (flow = text->flows; flow; flow = flow->next) {
1560       for (blk = flow->blocks; blk; blk = blk->next) {
1561 	for (line = blk->lines; line; line = line->next) {
1562 	  for (word = line->words; word; word = word->next) {
1563 	    words->append(word);
1564 	  }
1565 	}
1566       }
1567     }
1568   }
1569 }
1570 
~TextWordList()1571 TextWordList::~TextWordList() {
1572   delete words;
1573 }
1574 
getLength()1575 int TextWordList::getLength() {
1576   return words->getLength();
1577 }
1578 
get(int idx)1579 TextWord *TextWordList::get(int idx) {
1580   if (idx < 0 || idx >= words->getLength()) {
1581     return NULL;
1582   }
1583   return (TextWord *)words->get(idx);
1584 }
1585 
1586 #endif // TEXTOUT_WORD_LIST
1587 
1588 //------------------------------------------------------------------------
1589 // TextPage
1590 //------------------------------------------------------------------------
1591 
TextPage(GBool rawOrderA)1592 TextPage::TextPage(GBool rawOrderA) {
1593   int rot;
1594 
1595   rawOrder = rawOrderA;
1596   curWord = NULL;
1597   charPos = 0;
1598   curFont = NULL;
1599   curFontSize = 0;
1600   nest = 0;
1601   nTinyChars = 0;
1602   lastCharOverlap = gFalse;
1603   if (!rawOrder) {
1604     for (rot = 0; rot < 4; ++rot) {
1605       pools[rot] = new TextPool();
1606     }
1607   }
1608   flows = NULL;
1609   blocks = NULL;
1610   rawWords = NULL;
1611   rawLastWord = NULL;
1612   fonts = new GList();
1613   lastFindXMin = lastFindYMin = 0;
1614   haveLastFind = gFalse;
1615 }
1616 
~TextPage()1617 TextPage::~TextPage() {
1618   int rot;
1619 
1620   clear();
1621   if (!rawOrder) {
1622     for (rot = 0; rot < 4; ++rot) {
1623       delete pools[rot];
1624     }
1625   }
1626   delete fonts;
1627 }
1628 
startPage(GfxState * state)1629 void TextPage::startPage(GfxState *state) {
1630   clear();
1631   if (state) {
1632     pageWidth = state->getPageWidth();
1633     pageHeight = state->getPageHeight();
1634   } else {
1635     pageWidth = pageHeight = 0;
1636   }
1637 }
1638 
endPage()1639 void TextPage::endPage() {
1640   if (curWord) {
1641     endWord();
1642   }
1643 }
1644 
clear()1645 void TextPage::clear() {
1646   int rot;
1647   TextFlow *flow;
1648   TextWord *word;
1649 
1650   if (curWord) {
1651     delete curWord;
1652     curWord = NULL;
1653   }
1654   if (rawOrder) {
1655     while (rawWords) {
1656       word = rawWords;
1657       rawWords = rawWords->next;
1658       delete word;
1659     }
1660   } else {
1661     for (rot = 0; rot < 4; ++rot) {
1662       delete pools[rot];
1663     }
1664     while (flows) {
1665       flow = flows;
1666       flows = flows->next;
1667       delete flow;
1668     }
1669     gfree(blocks);
1670   }
1671   deleteGList(fonts, TextFontInfo);
1672 
1673   curWord = NULL;
1674   charPos = 0;
1675   curFont = NULL;
1676   curFontSize = 0;
1677   nest = 0;
1678   nTinyChars = 0;
1679   if (!rawOrder) {
1680     for (rot = 0; rot < 4; ++rot) {
1681       pools[rot] = new TextPool();
1682     }
1683   }
1684   flows = NULL;
1685   blocks = NULL;
1686   rawWords = NULL;
1687   rawLastWord = NULL;
1688   fonts = new GList();
1689 }
1690 
updateFont(GfxState * state)1691 void TextPage::updateFont(GfxState *state) {
1692   GfxFont *gfxFont;
1693   double *fm;
1694   char *name;
1695   int code, mCode, letterCode, anyCode;
1696   double w;
1697   int i;
1698 
1699   // get the font info object
1700   curFont = NULL;
1701   for (i = 0; i < fonts->getLength(); ++i) {
1702     curFont = (TextFontInfo *)fonts->get(i);
1703     if (curFont->matches(state)) {
1704       break;
1705     }
1706     curFont = NULL;
1707   }
1708   if (!curFont) {
1709     curFont = new TextFontInfo(state);
1710     fonts->append(curFont);
1711   }
1712 
1713   // adjust the font size
1714   gfxFont = state->getFont();
1715   curFontSize = state->getTransformedFontSize();
1716   if (gfxFont && gfxFont->getType() == fontType3) {
1717     // This is a hack which makes it possible to deal with some Type 3
1718     // fonts.  The problem is that it's impossible to know what the
1719     // base coordinate system used in the font is without actually
1720     // rendering the font.  This code tries to guess by looking at the
1721     // width of the character 'm' (which breaks if the font is a
1722     // subset that doesn't contain 'm').
1723     mCode = letterCode = anyCode = -1;
1724     for (code = 0; code < 256; ++code) {
1725       name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1726       if (name && name[0] == 'm' && name[1] == '\0') {
1727 	mCode = code;
1728       }
1729       if (letterCode < 0 && name && name[1] == '\0' &&
1730 	  ((name[0] >= 'A' && name[0] <= 'Z') ||
1731 	   (name[0] >= 'a' && name[0] <= 'z'))) {
1732 	letterCode = code;
1733       }
1734       if (anyCode < 0 && name &&
1735 	  ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1736 	anyCode = code;
1737       }
1738     }
1739     if (mCode >= 0 &&
1740 	(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1741       // 0.6 is a generic average 'm' width -- yes, this is a hack
1742       curFontSize *= w / 0.6;
1743     } else if (letterCode >= 0 &&
1744 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1745       // even more of a hack: 0.5 is a generic letter width
1746       curFontSize *= w / 0.5;
1747     } else if (anyCode >= 0 &&
1748 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1749       // better than nothing: 0.5 is a generic character width
1750       curFontSize *= w / 0.5;
1751     }
1752     fm = gfxFont->getFontMatrix();
1753     if (fm[0] != 0) {
1754       curFontSize *= fabs(fm[3] / fm[0]);
1755     }
1756   }
1757 }
1758 
beginWord(GfxState * state,double x0,double y0)1759 void TextPage::beginWord(GfxState *state, double x0, double y0) {
1760   double *fontm;
1761   double m[4], m2[4];
1762   int rot;
1763 
1764   // This check is needed because Type 3 characters can contain
1765   // text-drawing operations (when TextPage is being used via
1766   // {X,Win}SplashOutputDev rather than TextOutputDev).
1767   if (curWord) {
1768     ++nest;
1769     return;
1770   }
1771 
1772   // compute the rotation
1773   state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1774   if (state->getFont()->getType() == fontType3) {
1775     fontm = state->getFont()->getFontMatrix();
1776     m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1777     m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1778     m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1779     m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1780     m[0] = m2[0];
1781     m[1] = m2[1];
1782     m[2] = m2[2];
1783     m[3] = m2[3];
1784   }
1785   if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1786     rot = (m[3] < 0) ? 0 : 2;
1787   } else {
1788     rot = (m[2] > 0) ? 1 : 3;
1789   }
1790 
1791   curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
1792 }
1793 
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)1794 void TextPage::addChar(GfxState *state, double x, double y,
1795 		       double dx, double dy,
1796 		       CharCode c, int nBytes, Unicode *u, int uLen) {
1797   double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
1798   GBool overlap;
1799   int i;
1800 
1801   // throw away chars that aren't inside the page bounds
1802   state->transform(x, y, &x1, &y1);
1803   if (x1 < 0 || x1 > pageWidth ||
1804       y1 < 0 || y1 > pageHeight) {
1805     charPos += nBytes;
1806     return;
1807   }
1808 
1809   // subtract char and word spacing from the dx,dy values
1810   sp = state->getCharSpace();
1811   if (c == (CharCode)0x20) {
1812     sp += state->getWordSpace();
1813   }
1814   state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1815   dx -= dx2;
1816   dy -= dy2;
1817   state->transformDelta(dx, dy, &w1, &h1);
1818 
1819   // check the tiny chars limit
1820   if (!globalParams->getTextKeepTinyChars() &&
1821       fabs(w1) < 3 && fabs(h1) < 3) {
1822     if (++nTinyChars > 50000) {
1823       charPos += nBytes;
1824       return;
1825     }
1826   }
1827 
1828   // break words at space character
1829   if (uLen == 1 && u[0] == (Unicode)0x20) {
1830     if (curWord) {
1831       ++curWord->charLen;
1832     }
1833     charPos += nBytes;
1834     endWord();
1835     return;
1836   }
1837 
1838   // start a new word if:
1839   // (1) this character doesn't fall in the right place relative to
1840   //     the end of the previous word (this places upper and lower
1841   //     constraints on the position deltas along both the primary
1842   //     and secondary axes), or
1843   // (2) this character overlaps the previous one (duplicated text), or
1844   // (3) the previous character was an overlap (we want each duplicated
1845   //     character to be in a word by itself at this stage)
1846   if (curWord && curWord->len > 0) {
1847     base = sp = delta = 0; // make gcc happy
1848     switch (curWord->rot) {
1849     case 0:
1850       base = y1;
1851       sp = x1 - curWord->xMax;
1852       delta = x1 - curWord->edge[curWord->len - 1];
1853       break;
1854     case 1:
1855       base = x1;
1856       sp = y1 - curWord->yMax;
1857       delta = y1 - curWord->edge[curWord->len - 1];
1858       break;
1859     case 2:
1860       base = y1;
1861       sp = curWord->xMin - x1;
1862       delta = curWord->edge[curWord->len - 1] - x1;
1863       break;
1864     case 3:
1865       base = x1;
1866       sp = curWord->yMin - y1;
1867       delta = curWord->edge[curWord->len - 1] - y1;
1868       break;
1869     }
1870     overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
1871               fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
1872     if (overlap || lastCharOverlap ||
1873 	sp < -minDupBreakOverlap * curWord->fontSize ||
1874 	sp > minWordBreakSpace * curWord->fontSize ||
1875 	fabs(base - curWord->base) > 0.5) {
1876       endWord();
1877     }
1878     lastCharOverlap = overlap;
1879   } else {
1880     lastCharOverlap = gFalse;
1881   }
1882 
1883   if (uLen != 0) {
1884     // start a new word if needed
1885     if (!curWord) {
1886       beginWord(state, x, y);
1887     }
1888 
1889     // page rotation and/or transform matrices can cause text to be
1890     // drawn in reverse order -- in this case, swap the begin/end
1891     // coordinates and break text into individual chars
1892     if ((curWord->rot == 0 && w1 < 0) ||
1893 	(curWord->rot == 1 && h1 < 0) ||
1894 	(curWord->rot == 2 && w1 > 0) ||
1895 	(curWord->rot == 3 && h1 > 0)) {
1896       endWord();
1897       beginWord(state, x + dx, y + dy);
1898       x1 += w1;
1899       y1 += h1;
1900       w1 = -w1;
1901       h1 = -h1;
1902     }
1903 
1904     // add the characters to the current word
1905     w1 /= uLen;
1906     h1 /= uLen;
1907     for (i = 0; i < uLen; ++i) {
1908       curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
1909     }
1910   }
1911   if (curWord) {
1912     curWord->charLen += nBytes;
1913   }
1914   charPos += nBytes;
1915 }
1916 
endWord()1917 void TextPage::endWord() {
1918   // This check is needed because Type 3 characters can contain
1919   // text-drawing operations (when TextPage is being used via
1920   // {X,Win}SplashOutputDev rather than TextOutputDev).
1921   if (nest > 0) {
1922     --nest;
1923     return;
1924   }
1925 
1926   if (curWord) {
1927     addWord(curWord);
1928     curWord = NULL;
1929   }
1930 }
1931 
addWord(TextWord * word)1932 void TextPage::addWord(TextWord *word) {
1933   // throw away zero-length words -- they don't have valid xMin/xMax
1934   // values, and they're useless anyway
1935   if (word->len == 0) {
1936     delete word;
1937     return;
1938   }
1939 
1940   if (rawOrder) {
1941     if (rawLastWord) {
1942       rawLastWord->next = word;
1943     } else {
1944       rawWords = word;
1945     }
1946     rawLastWord = word;
1947   } else {
1948     pools[word->rot]->addWord(word);
1949   }
1950 }
1951 
coalesce(GBool physLayout)1952 void TextPage::coalesce(GBool physLayout) {
1953   UnicodeMap *uMap;
1954   TextPool *pool;
1955   TextWord *word0, *word1, *word2;
1956   TextLine *line;
1957   TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
1958   TextBlock **blkArray;
1959   TextFlow *flow, *lastFlow;
1960   int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
1961   double minBase, maxBase, newMinBase, newMaxBase;
1962   double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
1963   GBool found;
1964   int count[4];
1965   int lrCount;
1966   int firstBlkIdx, nBlocksLeft;
1967   int col1, col2;
1968   int i, j, n;
1969 
1970   if (rawOrder) {
1971     primaryRot = 0;
1972     primaryLR = gTrue;
1973     return;
1974   }
1975 
1976   uMap = globalParams->getTextEncoding();
1977   blkList = NULL;
1978   lastBlk = NULL;
1979   nBlocks = 0;
1980   primaryRot = -1;
1981 
1982 #if 0 // for debugging
1983   printf("*** initial words ***\n");
1984   for (rot = 0; rot < 4; ++rot) {
1985     pool = pools[rot];
1986     for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1987       for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
1988 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d '",
1989 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1990 	       word0->base, word0->fontSize, rot*90);
1991 	for (i = 0; i < word0->len; ++i) {
1992 	  fputc(word0->text[i] & 0xff, stdout);
1993 	}
1994 	printf("'\n");
1995       }
1996     }
1997   }
1998   printf("\n");
1999 #endif
2000 
2001   //----- assemble the blocks
2002 
2003   //~ add an outer loop for writing mode (vertical text)
2004 
2005   // build blocks for each rotation value
2006   for (rot = 0; rot < 4; ++rot) {
2007     pool = pools[rot];
2008     poolMinBaseIdx = pool->minBaseIdx;
2009     count[rot] = 0;
2010 
2011     // add blocks until no more words are left
2012     while (1) {
2013 
2014       // find the first non-empty line in the pool
2015       for (;
2016 	   poolMinBaseIdx <= pool->maxBaseIdx &&
2017 	     !pool->getPool(poolMinBaseIdx);
2018 	   ++poolMinBaseIdx) ;
2019       if (poolMinBaseIdx > pool->maxBaseIdx) {
2020 	break;
2021       }
2022 
2023       // look for the left-most word in the first four lines of the
2024       // pool -- this avoids starting with a superscript word
2025       startBaseIdx = poolMinBaseIdx;
2026       for (baseIdx = poolMinBaseIdx + 1;
2027 	   baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2028 	   ++baseIdx) {
2029 	if (!pool->getPool(baseIdx)) {
2030 	  continue;
2031 	}
2032 	if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2033 	    < 0) {
2034 	  startBaseIdx = baseIdx;
2035 	}
2036       }
2037 
2038       // create a new block
2039       word0 = pool->getPool(startBaseIdx);
2040       pool->setPool(startBaseIdx, word0->next);
2041       word0->next = NULL;
2042       blk = new TextBlock(this, rot);
2043       blk->addWord(word0);
2044 
2045       fontSize = word0->fontSize;
2046       minBase = maxBase = word0->base;
2047       colSpace1 = minColSpacing1 * fontSize;
2048       colSpace2 = minColSpacing2 * fontSize;
2049       lineSpace = maxLineSpacingDelta * fontSize;
2050       intraLineSpace = maxIntraLineDelta * fontSize;
2051 
2052       // add words to the block
2053       do {
2054 	found = gFalse;
2055 
2056 	// look for words on the line above the current top edge of
2057 	// the block
2058 	newMinBase = minBase;
2059 	for (baseIdx = pool->getBaseIdx(minBase);
2060 	     baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2061 	     --baseIdx) {
2062 	  word0 = NULL;
2063 	  word1 = pool->getPool(baseIdx);
2064 	  while (word1) {
2065 	    if (word1->base < minBase &&
2066 		word1->base >= minBase - lineSpace &&
2067 		((rot == 0 || rot == 2)
2068 		 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2069 		 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2070 		fabs(word1->fontSize - fontSize) <
2071 		  maxBlockFontSizeDelta1 * fontSize) {
2072 	      word2 = word1;
2073 	      if (word0) {
2074 		word0->next = word1->next;
2075 	      } else {
2076 		pool->setPool(baseIdx, word1->next);
2077 	      }
2078 	      word1 = word1->next;
2079 	      word2->next = NULL;
2080 	      blk->addWord(word2);
2081 	      found = gTrue;
2082 	      newMinBase = word2->base;
2083 	    } else {
2084 	      word0 = word1;
2085 	      word1 = word1->next;
2086 	    }
2087 	  }
2088 	}
2089 	minBase = newMinBase;
2090 
2091 	// look for words on the line below the current bottom edge of
2092 	// the block
2093 	newMaxBase = maxBase;
2094 	for (baseIdx = pool->getBaseIdx(maxBase);
2095 	     baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2096 	     ++baseIdx) {
2097 	  word0 = NULL;
2098 	  word1 = pool->getPool(baseIdx);
2099 	  while (word1) {
2100 	    if (word1->base > maxBase &&
2101 		word1->base <= maxBase + lineSpace &&
2102 		((rot == 0 || rot == 2)
2103 		 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2104 		 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2105 		fabs(word1->fontSize - fontSize) <
2106 		  maxBlockFontSizeDelta1 * fontSize) {
2107 	      word2 = word1;
2108 	      if (word0) {
2109 		word0->next = word1->next;
2110 	      } else {
2111 		pool->setPool(baseIdx, word1->next);
2112 	      }
2113 	      word1 = word1->next;
2114 	      word2->next = NULL;
2115 	      blk->addWord(word2);
2116 	      found = gTrue;
2117 	      newMaxBase = word2->base;
2118 	    } else {
2119 	      word0 = word1;
2120 	      word1 = word1->next;
2121 	    }
2122 	  }
2123 	}
2124 	maxBase = newMaxBase;
2125 
2126 	// look for words that are on lines already in the block, and
2127 	// that overlap the block horizontally
2128 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2129 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2130 	     ++baseIdx) {
2131 	  word0 = NULL;
2132 	  word1 = pool->getPool(baseIdx);
2133 	  while (word1) {
2134 	    if (word1->base >= minBase - intraLineSpace &&
2135 		word1->base <= maxBase + intraLineSpace &&
2136 		((rot == 0 || rot == 2)
2137 		 ? (word1->xMin < blk->xMax + colSpace1 &&
2138 		    word1->xMax > blk->xMin - colSpace1)
2139 		 : (word1->yMin < blk->yMax + colSpace1 &&
2140 		    word1->yMax > blk->yMin - colSpace1)) &&
2141 		fabs(word1->fontSize - fontSize) <
2142 		  maxBlockFontSizeDelta2 * fontSize) {
2143 	      word2 = word1;
2144 	      if (word0) {
2145 		word0->next = word1->next;
2146 	      } else {
2147 		pool->setPool(baseIdx, word1->next);
2148 	      }
2149 	      word1 = word1->next;
2150 	      word2->next = NULL;
2151 	      blk->addWord(word2);
2152 	      found = gTrue;
2153 	    } else {
2154 	      word0 = word1;
2155 	      word1 = word1->next;
2156 	    }
2157 	  }
2158 	}
2159 
2160 	// only check for outlying words (the next two chunks of code)
2161 	// if we didn't find anything else
2162 	if (found) {
2163 	  continue;
2164 	}
2165 
2166 	// scan down the left side of the block, looking for words
2167 	// that are near (but not overlapping) the block; if there are
2168 	// three or fewer, add them to the block
2169 	n = 0;
2170 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2171 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2172 	     ++baseIdx) {
2173 	  word1 = pool->getPool(baseIdx);
2174 	  while (word1) {
2175 	    if (word1->base >= minBase - intraLineSpace &&
2176 		word1->base <= maxBase + intraLineSpace &&
2177 		((rot == 0 || rot == 2)
2178 		 ? (word1->xMax <= blk->xMin &&
2179 		    word1->xMax > blk->xMin - colSpace2)
2180 		 : (word1->yMax <= blk->yMin &&
2181 		    word1->yMax > blk->yMin - colSpace2)) &&
2182 		fabs(word1->fontSize - fontSize) <
2183 		  maxBlockFontSizeDelta3 * fontSize) {
2184 	      ++n;
2185 	      break;
2186 	    }
2187 	    word1 = word1->next;
2188 	  }
2189 	}
2190 	if (n > 0 && n <= 3) {
2191 	  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2192 	       baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2193 	       ++baseIdx) {
2194 	    word0 = NULL;
2195 	    word1 = pool->getPool(baseIdx);
2196 	    while (word1) {
2197 	      if (word1->base >= minBase - intraLineSpace &&
2198 		  word1->base <= maxBase + intraLineSpace &&
2199 		  ((rot == 0 || rot == 2)
2200 		   ? (word1->xMax <= blk->xMin &&
2201 		      word1->xMax > blk->xMin - colSpace2)
2202 		   : (word1->yMax <= blk->yMin &&
2203 		      word1->yMax > blk->yMin - colSpace2)) &&
2204 		  fabs(word1->fontSize - fontSize) <
2205 		    maxBlockFontSizeDelta3 * fontSize) {
2206 		word2 = word1;
2207 		if (word0) {
2208 		  word0->next = word1->next;
2209 		} else {
2210 		  pool->setPool(baseIdx, word1->next);
2211 		}
2212 		word1 = word1->next;
2213 		word2->next = NULL;
2214 		blk->addWord(word2);
2215 		if (word2->base < minBase) {
2216 		  minBase = word2->base;
2217 		} else if (word2->base > maxBase) {
2218 		  maxBase = word2->base;
2219 		}
2220 		found = gTrue;
2221 		break;
2222 	      } else {
2223 		word0 = word1;
2224 		word1 = word1->next;
2225 	      }
2226 	    }
2227 	  }
2228 	}
2229 
2230 	// scan down the right side of the block, looking for words
2231 	// that are near (but not overlapping) the block; if there are
2232 	// three or fewer, add them to the block
2233 	n = 0;
2234 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2235 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2236 	     ++baseIdx) {
2237 	  word1 = pool->getPool(baseIdx);
2238 	  while (word1) {
2239 	    if (word1->base >= minBase - intraLineSpace &&
2240 		word1->base <= maxBase + intraLineSpace &&
2241 		((rot == 0 || rot == 2)
2242 		 ? (word1->xMin >= blk->xMax &&
2243 		    word1->xMin < blk->xMax + colSpace2)
2244 		 : (word1->yMin >= blk->yMax &&
2245 		    word1->yMin < blk->yMax + colSpace2)) &&
2246 		fabs(word1->fontSize - fontSize) <
2247 		  maxBlockFontSizeDelta3 * fontSize) {
2248 	      ++n;
2249 	      break;
2250 	    }
2251 	    word1 = word1->next;
2252 	  }
2253 	}
2254 	if (n > 0 && n <= 3) {
2255 	  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2256 	       baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2257 	       ++baseIdx) {
2258 	    word0 = NULL;
2259 	    word1 = pool->getPool(baseIdx);
2260 	    while (word1) {
2261 	      if (word1->base >= minBase - intraLineSpace &&
2262 		  word1->base <= maxBase + intraLineSpace &&
2263 		  ((rot == 0 || rot == 2)
2264 		   ? (word1->xMin >= blk->xMax &&
2265 		      word1->xMin < blk->xMax + colSpace2)
2266 		   : (word1->yMin >= blk->yMax &&
2267 		      word1->yMin < blk->yMax + colSpace2)) &&
2268 		  fabs(word1->fontSize - fontSize) <
2269 		    maxBlockFontSizeDelta3 * fontSize) {
2270 		word2 = word1;
2271 		if (word0) {
2272 		  word0->next = word1->next;
2273 		} else {
2274 		  pool->setPool(baseIdx, word1->next);
2275 		}
2276 		word1 = word1->next;
2277 		word2->next = NULL;
2278 		blk->addWord(word2);
2279 		if (word2->base < minBase) {
2280 		  minBase = word2->base;
2281 		} else if (word2->base > maxBase) {
2282 		  maxBase = word2->base;
2283 		}
2284 		found = gTrue;
2285 		break;
2286 	      } else {
2287 		word0 = word1;
2288 		word1 = word1->next;
2289 	      }
2290 	    }
2291 	  }
2292 	}
2293 
2294       } while (found);
2295 
2296       //~ need to compute the primary writing mode (horiz/vert) in
2297       //~ addition to primary rotation
2298 
2299       // coalesce the block, and add it to the list
2300       blk->coalesce(uMap);
2301       if (lastBlk) {
2302 	lastBlk->next = blk;
2303       } else {
2304 	blkList = blk;
2305       }
2306       lastBlk = blk;
2307       count[rot] += blk->charCount;
2308       if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2309 	primaryRot = rot;
2310       }
2311       ++nBlocks;
2312     }
2313   }
2314 
2315 #if 0 // for debugging
2316   printf("*** rotation ***\n");
2317   for (rot = 0; rot < 4; ++rot) {
2318     printf("  %d: %6d\n", rot, count[rot]);
2319   }
2320   printf("  primary rot = %d\n", primaryRot);
2321   printf("\n");
2322 #endif
2323 
2324 #if 0 // for debugging
2325   printf("*** blocks ***\n");
2326   for (blk = blkList; blk; blk = blk->next) {
2327     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2328 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2329     for (line = blk->lines; line; line = line->next) {
2330       printf("  line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2331 	     line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2332       for (word0 = line->words; word0; word0 = word0->next) {
2333 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2334 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2335 	       word0->base, word0->fontSize, word0->spaceAfter);
2336 	for (i = 0; i < word0->len; ++i) {
2337 	  fputc(word0->text[i] & 0xff, stdout);
2338 	}
2339 	printf("'\n");
2340       }
2341     }
2342   }
2343   printf("\n");
2344 #endif
2345 
2346   // determine the primary direction
2347   lrCount = 0;
2348   for (blk = blkList; blk; blk = blk->next) {
2349     for (line = blk->lines; line; line = line->next) {
2350       for (word0 = line->words; word0; word0 = word0->next) {
2351 	for (i = 0; i < word0->len; ++i) {
2352 	  if (unicodeTypeL(word0->text[i])) {
2353 	    ++lrCount;
2354 	  } else if (unicodeTypeR(word0->text[i])) {
2355 	    --lrCount;
2356 	  }
2357 	}
2358       }
2359     }
2360   }
2361   primaryLR = lrCount >= 0;
2362 
2363 #if 0 // for debugging
2364   printf("*** direction ***\n");
2365   printf("lrCount = %d\n", lrCount);
2366   printf("primaryLR = %d\n", primaryLR);
2367 #endif
2368 
2369   //----- column assignment
2370 
2371   // sort blocks into xy order for column assignment
2372   blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2373   for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2374     blocks[i] = blk;
2375   }
2376   qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2377 
2378   // column assignment
2379   for (i = 0; i < nBlocks; ++i) {
2380     blk0 = blocks[i];
2381     col1 = 0;
2382     for (j = 0; j < i; ++j) {
2383       blk1 = blocks[j];
2384       col2 = 0; // make gcc happy
2385       switch (primaryRot) {
2386       case 0:
2387 	if (blk0->xMin > blk1->xMax) {
2388 	  col2 = blk1->col + blk1->nColumns + 3;
2389 	} else if (blk1->xMax == blk1->xMin) {
2390 	  col2 = blk1->col;
2391 	} else {
2392 	  col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2393 				    (blk1->xMax - blk1->xMin)) *
2394 				   blk1->nColumns);
2395 	}
2396 	break;
2397       case 1:
2398 	if (blk0->yMin > blk1->yMax) {
2399 	  col2 = blk1->col + blk1->nColumns + 3;
2400 	} else if (blk1->yMax == blk1->yMin) {
2401 	  col2 = blk1->col;
2402 	} else {
2403 	  col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2404 				    (blk1->yMax - blk1->yMin)) *
2405 				   blk1->nColumns);
2406 	}
2407 	break;
2408       case 2:
2409 	if (blk0->xMax < blk1->xMin) {
2410 	  col2 = blk1->col + blk1->nColumns + 3;
2411 	} else if (blk1->xMin == blk1->xMax) {
2412 	  col2 = blk1->col;
2413 	} else {
2414 	  col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2415 				    (blk1->xMin - blk1->xMax)) *
2416 				   blk1->nColumns);
2417 	}
2418 	break;
2419       case 3:
2420 	if (blk0->yMax < blk1->yMin) {
2421 	  col2 = blk1->col + blk1->nColumns + 3;
2422 	} else if (blk1->yMin == blk1->yMax) {
2423 	  col2 = blk1->col;
2424 	} else {
2425 	  col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2426 				    (blk1->yMin - blk1->yMax)) *
2427 				   blk1->nColumns);
2428 	}
2429 	break;
2430       }
2431       if (col2 > col1) {
2432 	col1 = col2;
2433       }
2434     }
2435     blk0->col = col1;
2436     for (line = blk0->lines; line; line = line->next) {
2437       for (j = 0; j <= line->len; ++j) {
2438 	line->col[j] += col1;
2439       }
2440     }
2441   }
2442 
2443 #if 0 // for debugging
2444   printf("*** blocks, after column assignment ***\n");
2445   for (blk = blkList; blk; blk = blk->next) {
2446     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2447 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2448 	   blk->nColumns);
2449     for (line = blk->lines; line; line = line->next) {
2450       printf("  line:\n");
2451       for (word0 = line->words; word0; word0 = word0->next) {
2452 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2453 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2454 	       word0->base, word0->fontSize, word0->spaceAfter);
2455 	for (i = 0; i < word0->len; ++i) {
2456 	  fputc(word0->text[i] & 0xff, stdout);
2457 	}
2458 	printf("'\n");
2459       }
2460     }
2461   }
2462   printf("\n");
2463 #endif
2464 
2465   //----- reading order sort
2466 
2467   // sort blocks into yx order (in preparation for reading order sort)
2468   qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2469 
2470   // compute space on left and right sides of each block
2471   for (i = 0; i < nBlocks; ++i) {
2472     blk0 = blocks[i];
2473     for (j = 0; j < nBlocks; ++j) {
2474       blk1 = blocks[j];
2475       if (blk1 != blk0) {
2476 	blk0->updatePriMinMax(blk1);
2477       }
2478     }
2479   }
2480 
2481 #if 0 // for debugging
2482   printf("*** blocks, after yx sort ***\n");
2483   for (i = 0; i < nBlocks; ++i) {
2484     blk = blocks[i];
2485     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2486 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2487 	   blk->priMin, blk->priMax);
2488     for (line = blk->lines; line; line = line->next) {
2489       printf("  line:\n");
2490       for (word0 = line->words; word0; word0 = word0->next) {
2491 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2492 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2493 	       word0->base, word0->fontSize, word0->spaceAfter);
2494 	for (j = 0; j < word0->len; ++j) {
2495 	  fputc(word0->text[j] & 0xff, stdout);
2496 	}
2497 	printf("'\n");
2498       }
2499     }
2500   }
2501   printf("\n");
2502 #endif
2503 
2504   // build the flows
2505   //~ this needs to be adjusted for writing mode (vertical text)
2506   //~ this also needs to account for right-to-left column ordering
2507   blkArray = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2508   memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2509   flows = lastFlow = NULL;
2510   firstBlkIdx = 0;
2511   nBlocksLeft = nBlocks;
2512   while (nBlocksLeft > 0) {
2513 
2514     // find the upper-left-most block
2515     for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2516     i = firstBlkIdx;
2517     blk = blkArray[i];
2518     for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2519       blk1 = blkArray[j];
2520       if (blk1) {
2521 	if (blk && blk->secondaryDelta(blk1) > 0) {
2522 	  break;
2523 	}
2524 	if (blk1->primaryCmp(blk) < 0) {
2525 	  i = j;
2526 	  blk = blk1;
2527 	}
2528       }
2529     }
2530     blkArray[i] = NULL;
2531     --nBlocksLeft;
2532     blk->next = NULL;
2533 
2534     // create a new flow, starting with the upper-left-most block
2535     flow = new TextFlow(this, blk);
2536     if (lastFlow) {
2537       lastFlow->next = flow;
2538     } else {
2539       flows = flow;
2540     }
2541     lastFlow = flow;
2542     fontSize = blk->lines->words->fontSize;
2543 
2544     // push the upper-left-most block on the stack
2545     blk->stackNext = NULL;
2546     blkStack = blk;
2547 
2548     // find the other blocks in this flow
2549     while (blkStack) {
2550 
2551       // find the upper-left-most block under (but within
2552       // maxBlockSpacing of) the top block on the stack
2553       blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2554       blk = NULL;
2555       i = -1;
2556       for (j = firstBlkIdx; j < nBlocks; ++j) {
2557 	blk1 = blkArray[j];
2558 	if (blk1) {
2559 	  if (blkStack->secondaryDelta(blk1) > blkSpace) {
2560 	    break;
2561 	  }
2562 	  if (blk && blk->secondaryDelta(blk1) > 0) {
2563 	    break;
2564 	  }
2565 	  if (blk1->isBelow(blkStack) &&
2566 	      (!blk || blk1->primaryCmp(blk) < 0)) {
2567 	    i = j;
2568 	    blk = blk1;
2569 	  }
2570 	}
2571       }
2572 
2573       // if a suitable block was found, add it to the flow and push it
2574       // onto the stack
2575       if (blk && flow->blockFits(blk, blkStack)) {
2576 	blkArray[i] = NULL;
2577 	--nBlocksLeft;
2578 	blk->next = NULL;
2579 	flow->addBlock(blk);
2580 	fontSize = blk->lines->words->fontSize;
2581 	blk->stackNext = blkStack;
2582 	blkStack = blk;
2583 
2584       // otherwise (if there is no block under the top block or the
2585       // block is not suitable), pop the stack
2586       } else {
2587 	blkStack = blkStack->stackNext;
2588       }
2589     }
2590   }
2591   gfree(blkArray);
2592 
2593 #if 0 // for debugging
2594   printf("*** flows ***\n");
2595   for (flow = flows; flow; flow = flow->next) {
2596     printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2597 	   flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2598 	   flow->priMin, flow->priMax);
2599     for (blk = flow->blocks; blk; blk = blk->next) {
2600       printf("  block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2601 	     blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2602 	     blk->priMin, blk->priMax);
2603       for (line = blk->lines; line; line = line->next) {
2604 	printf("    line:\n");
2605 	for (word0 = line->words; word0; word0 = word0->next) {
2606 	  printf("      word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2607 		 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2608 		 word0->base, word0->fontSize, word0->spaceAfter);
2609 	  for (i = 0; i < word0->len; ++i) {
2610 	    fputc(word0->text[i] & 0xff, stdout);
2611 	  }
2612 	  printf("'\n");
2613 	}
2614       }
2615     }
2616   }
2617   printf("\n");
2618 #endif
2619 
2620   if (uMap) {
2621     uMap->decRefCnt();
2622   }
2623 }
2624 
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)2625 GBool TextPage::findText(Unicode *s, int len,
2626 			 GBool startAtTop, GBool stopAtBottom,
2627 			 GBool startAtLast, GBool stopAtLast,
2628 			 GBool caseSensitive, GBool backward,
2629 			 double *xMin, double *yMin,
2630 			 double *xMax, double *yMax) {
2631   TextBlock *blk;
2632   TextLine *line;
2633   Unicode *s2, *txt;
2634   Unicode *p;
2635   int txtSize, m, i, j, k;
2636   double xStart, yStart, xStop, yStop;
2637   double xMin0, yMin0, xMax0, yMax0;
2638   double xMin1, yMin1, xMax1, yMax1;
2639   GBool found;
2640 
2641   //~ needs to handle right-to-left text
2642 
2643   if (rawOrder) {
2644     return gFalse;
2645   }
2646 
2647   // convert the search string to uppercase
2648   if (!caseSensitive) {
2649     s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
2650     for (i = 0; i < len; ++i) {
2651       s2[i] = unicodeToUpper(s[i]);
2652     }
2653   } else {
2654     s2 = s;
2655   }
2656 
2657   txt = NULL;
2658   txtSize = 0;
2659 
2660   xStart = yStart = xStop = yStop = 0;
2661   if (startAtLast && haveLastFind) {
2662     xStart = lastFindXMin;
2663     yStart = lastFindYMin;
2664   } else if (!startAtTop) {
2665     xStart = *xMin;
2666     yStart = *yMin;
2667   }
2668   if (stopAtLast && haveLastFind) {
2669     xStop = lastFindXMin;
2670     yStop = lastFindYMin;
2671   } else if (!stopAtBottom) {
2672     xStop = *xMax;
2673     yStop = *yMax;
2674   }
2675 
2676   found = gFalse;
2677   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2678   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2679 
2680   for (i = backward ? nBlocks - 1 : 0;
2681        backward ? i >= 0 : i < nBlocks;
2682        i += backward ? -1 : 1) {
2683     blk = blocks[i];
2684 
2685     // check: is the block above the top limit?
2686     if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
2687       continue;
2688     }
2689 
2690     // check: is the block below the bottom limit?
2691     if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
2692       break;
2693     }
2694 
2695     for (line = blk->lines; line; line = line->next) {
2696 
2697       // check: is the line above the top limit?
2698       if (!startAtTop &&
2699 	  (backward ? line->yMin > yStart : line->yMin < yStart)) {
2700 	continue;
2701       }
2702 
2703       // check: is the line below the bottom limit?
2704       if (!stopAtBottom &&
2705 	  (backward ? line->yMin < yStop : line->yMin > yStop)) {
2706 	continue;
2707       }
2708 
2709       // convert the line to uppercase
2710       m = line->len;
2711       if (!caseSensitive) {
2712 	if (m > txtSize) {
2713 	  txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
2714 	  txtSize = m;
2715 	}
2716 	for (k = 0; k < m; ++k) {
2717 	  txt[k] = unicodeToUpper(line->text[k]);
2718 	}
2719       } else {
2720 	txt = line->text;
2721       }
2722 
2723       // search each position in this line
2724       j = backward ? m - len : 0;
2725       p = txt + j;
2726       while (backward ? j >= 0 : j <= m - len) {
2727 
2728 	// compare the strings
2729 	for (k = 0; k < len; ++k) {
2730 	  if (p[k] != s2[k]) {
2731 	    break;
2732 	  }
2733 	}
2734 
2735 	// found it
2736 	if (k == len) {
2737 	  switch (line->rot) {
2738 	  case 0:
2739 	    xMin1 = line->edge[j];
2740 	    xMax1 = line->edge[j + len];
2741 	    yMin1 = line->yMin;
2742 	    yMax1 = line->yMax;
2743 	    break;
2744 	  case 1:
2745 	    xMin1 = line->xMin;
2746 	    xMax1 = line->xMax;
2747 	    yMin1 = line->edge[j];
2748 	    yMax1 = line->edge[j + len];
2749 	    break;
2750 	  case 2:
2751 	    xMin1 = line->edge[j + len];
2752 	    xMax1 = line->edge[j];
2753 	    yMin1 = line->yMin;
2754 	    yMax1 = line->yMax;
2755 	    break;
2756 	  case 3:
2757 	    xMin1 = line->xMin;
2758 	    xMax1 = line->xMax;
2759 	    yMin1 = line->edge[j + len];
2760 	    yMax1 = line->edge[j];
2761 	    break;
2762 	  }
2763 	  if (backward) {
2764 	    if ((startAtTop ||
2765 		 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
2766 		(stopAtBottom ||
2767 		 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
2768 	      if (!found ||
2769 		  yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
2770 		xMin0 = xMin1;
2771 		xMax0 = xMax1;
2772 		yMin0 = yMin1;
2773 		yMax0 = yMax1;
2774 		found = gTrue;
2775 	      }
2776 	    }
2777 	  } else {
2778 	    if ((startAtTop ||
2779 		 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
2780 		(stopAtBottom ||
2781 		 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
2782 	      if (!found ||
2783 		  yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
2784 		xMin0 = xMin1;
2785 		xMax0 = xMax1;
2786 		yMin0 = yMin1;
2787 		yMax0 = yMax1;
2788 		found = gTrue;
2789 	      }
2790 	    }
2791 	  }
2792 	}
2793 	if (backward) {
2794 	  --j;
2795 	  --p;
2796 	} else {
2797 	  ++j;
2798 	  ++p;
2799 	}
2800       }
2801     }
2802   }
2803 
2804   if (!caseSensitive) {
2805     gfree(s2);
2806     gfree(txt);
2807   }
2808 
2809   if (found) {
2810     *xMin = xMin0;
2811     *xMax = xMax0;
2812     *yMin = yMin0;
2813     *yMax = yMax0;
2814     lastFindXMin = xMin0;
2815     lastFindYMin = yMin0;
2816     haveLastFind = gTrue;
2817     return gTrue;
2818   }
2819 
2820   return gFalse;
2821 }
2822 
getText(double xMin,double yMin,double xMax,double yMax)2823 GString *TextPage::getText(double xMin, double yMin,
2824 			   double xMax, double yMax) {
2825   GString *s;
2826   UnicodeMap *uMap;
2827   GBool isUnicode;
2828   TextBlock *blk;
2829   TextLine *line;
2830   TextLineFrag *frags;
2831   int nFrags, fragsSize;
2832   TextLineFrag *frag;
2833   char space[8], eol[16];
2834   int spaceLen, eolLen;
2835   int lastRot;
2836   double x, y;
2837   int col, idx0, idx1, i, j;
2838   GBool multiLine, oneRot;
2839 
2840   s = new GString();
2841 
2842   if (rawOrder) {
2843     return s;
2844   }
2845 
2846   // get the output encoding
2847   if (!(uMap = globalParams->getTextEncoding())) {
2848     return s;
2849   }
2850   isUnicode = uMap->isUnicode();
2851   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
2852   eolLen = 0; // make gcc happy
2853   switch (globalParams->getTextEOL()) {
2854   case eolUnix:
2855     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
2856     break;
2857   case eolDOS:
2858     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2859     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
2860     break;
2861   case eolMac:
2862     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2863     break;
2864   }
2865 
2866   //~ writing mode (horiz/vert)
2867 
2868   // collect the line fragments that are in the rectangle
2869   fragsSize = 256;
2870   frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
2871   nFrags = 0;
2872   lastRot = -1;
2873   oneRot = gTrue;
2874   for (i = 0; i < nBlocks; ++i) {
2875     blk = blocks[i];
2876     if (xMin < blk->xMax && blk->xMin < xMax &&
2877 	yMin < blk->yMax && blk->yMin < yMax) {
2878       for (line = blk->lines; line; line = line->next) {
2879 	if (xMin < line->xMax && line->xMin < xMax &&
2880 	    yMin < line->yMax && line->yMin < yMax) {
2881 	  idx0 = idx1 = -1;
2882 	  switch (line->rot) {
2883 	  case 0:
2884 	    y = 0.5 * (line->yMin + line->yMax);
2885 	    if (yMin < y && y < yMax) {
2886 	      j = 0;
2887 	      while (j < line->len) {
2888 		if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2889 		  idx0 = j;
2890 		  break;
2891 		}
2892 		++j;
2893 	      }
2894 	      j = line->len - 1;
2895 	      while (j >= 0) {
2896 		if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2897 		  idx1 = j;
2898 		  break;
2899 		}
2900 		--j;
2901 	      }
2902 	    }
2903 	    break;
2904 	  case 1:
2905 	    x = 0.5 * (line->xMin + line->xMax);
2906 	    if (xMin < x && x < xMax) {
2907 	      j = 0;
2908 	      while (j < line->len) {
2909 		if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2910 		  idx0 = j;
2911 		  break;
2912 		}
2913 		++j;
2914 	      }
2915 	      j = line->len - 1;
2916 	      while (j >= 0) {
2917 		if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2918 		  idx1 = j;
2919 		  break;
2920 		}
2921 		--j;
2922 	      }
2923 	    }
2924 	    break;
2925 	  case 2:
2926 	    y = 0.5 * (line->yMin + line->yMax);
2927 	    if (yMin < y && y < yMax) {
2928 	      j = 0;
2929 	      while (j < line->len) {
2930 		if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2931 		  idx0 = j;
2932 		  break;
2933 		}
2934 		++j;
2935 	      }
2936 	      j = line->len - 1;
2937 	      while (j >= 0) {
2938 		if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2939 		  idx1 = j;
2940 		  break;
2941 		}
2942 		--j;
2943 	      }
2944 	    }
2945 	    break;
2946 	  case 3:
2947 	    x = 0.5 * (line->xMin + line->xMax);
2948 	    if (xMin < x && x < xMax) {
2949 	      j = 0;
2950 	      while (j < line->len) {
2951 		if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2952 		  idx0 = j;
2953 		  break;
2954 		}
2955 		++j;
2956 	      }
2957 	      j = line->len - 1;
2958 	      while (j >= 0) {
2959 		if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2960 		  idx1 = j;
2961 		  break;
2962 		}
2963 		--j;
2964 	      }
2965 	    }
2966 	    break;
2967 	  }
2968 	  if (idx0 >= 0 && idx1 >= 0) {
2969 	    if (nFrags == fragsSize) {
2970 	      fragsSize *= 2;
2971 	      frags = (TextLineFrag *)
2972 		          greallocn(frags, fragsSize, sizeof(TextLineFrag));
2973 	    }
2974 	    frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
2975 	    ++nFrags;
2976 	    if (lastRot >= 0 && line->rot != lastRot) {
2977 	      oneRot = gFalse;
2978 	    }
2979 	    lastRot = line->rot;
2980 	  }
2981 	}
2982       }
2983     }
2984   }
2985 
2986   // sort the fragments and generate the string
2987   if (nFrags > 0) {
2988 
2989     for (i = 0; i < nFrags; ++i) {
2990       frags[i].computeCoords(oneRot);
2991     }
2992     assignColumns(frags, nFrags, oneRot);
2993 
2994     // if all lines in the region have the same rotation, use it;
2995     // otherwise, use the page's primary rotation
2996     if (oneRot) {
2997       qsort(frags, nFrags, sizeof(TextLineFrag),
2998 	    &TextLineFrag::cmpYXLineRot);
2999     } else {
3000       qsort(frags, nFrags, sizeof(TextLineFrag),
3001 	    &TextLineFrag::cmpYXPrimaryRot);
3002     }
3003 
3004     col = 0;
3005     multiLine = gFalse;
3006     for (i = 0; i < nFrags; ++i) {
3007       frag = &frags[i];
3008 
3009       // insert a return
3010       if (frag->col < col ||
3011 	  (i > 0 && fabs(frag->base - frags[i-1].base) >
3012 	              maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3013 	s->append(eol, eolLen);
3014 	col = 0;
3015 	multiLine = gTrue;
3016       }
3017 
3018       // column alignment
3019       for (; col < frag->col; ++col) {
3020 	s->append(space, spaceLen);
3021       }
3022 
3023       // get the fragment text
3024       col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3025     }
3026 
3027     if (multiLine) {
3028       s->append(eol, eolLen);
3029     }
3030   }
3031 
3032   gfree(frags);
3033   uMap->decRefCnt();
3034 
3035   return s;
3036 }
3037 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)3038 GBool TextPage::findCharRange(int pos, int length,
3039 			      double *xMin, double *yMin,
3040 			      double *xMax, double *yMax) {
3041   TextBlock *blk;
3042   TextLine *line;
3043   TextWord *word;
3044   double xMin0, xMax0, yMin0, yMax0;
3045   double xMin1, xMax1, yMin1, yMax1;
3046   GBool first;
3047   int i, j0, j1;
3048 
3049   if (rawOrder) {
3050     return gFalse;
3051   }
3052 
3053   //~ this doesn't correctly handle:
3054   //~ - ranges split across multiple lines (the highlighted region
3055   //~   is the bounding box of all the parts of the range)
3056   //~ - cases where characters don't convert one-to-one into Unicode
3057   first = gTrue;
3058   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3059   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3060   for (i = 0; i < nBlocks; ++i) {
3061     blk = blocks[i];
3062     for (line = blk->lines; line; line = line->next) {
3063       for (word = line->words; word; word = word->next) {
3064 	if (pos < word->charPos + word->charLen &&
3065 	    word->charPos < pos + length) {
3066 	  j0 = pos - word->charPos;
3067 	  if (j0 < 0) {
3068 	    j0 = 0;
3069 	  }
3070 	  j1 = pos + length - 1 - word->charPos;
3071 	  if (j1 >= word->len) {
3072 	    j1 = word->len - 1;
3073 	  }
3074 	  switch (line->rot) {
3075 	  case 0:
3076 	    xMin1 = word->edge[j0];
3077 	    xMax1 = word->edge[j1 + 1];
3078 	    yMin1 = word->yMin;
3079 	    yMax1 = word->yMax;
3080 	    break;
3081 	  case 1:
3082 	    xMin1 = word->xMin;
3083 	    xMax1 = word->xMax;
3084 	    yMin1 = word->edge[j0];
3085 	    yMax1 = word->edge[j1 + 1];
3086 	    break;
3087 	  case 2:
3088 	    xMin1 = word->edge[j1 + 1];
3089 	    xMax1 = word->edge[j0];
3090 	    yMin1 = word->yMin;
3091 	    yMax1 = word->yMax;
3092 	    break;
3093 	  case 3:
3094 	    xMin1 = word->xMin;
3095 	    xMax1 = word->xMax;
3096 	    yMin1 = word->edge[j1 + 1];
3097 	    yMax1 = word->edge[j0];
3098 	    break;
3099 	  }
3100 	  if (first || xMin1 < xMin0) {
3101 	    xMin0 = xMin1;
3102 	  }
3103 	  if (first || xMax1 > xMax0) {
3104 	    xMax0 = xMax1;
3105 	  }
3106 	  if (first || yMin1 < yMin0) {
3107 	    yMin0 = yMin1;
3108 	  }
3109 	  if (first || yMax1 > yMax0) {
3110 	    yMax0 = yMax1;
3111 	  }
3112 	  first = gFalse;
3113 	}
3114       }
3115     }
3116   }
3117   if (!first) {
3118     *xMin = xMin0;
3119     *xMax = xMax0;
3120     *yMin = yMin0;
3121     *yMax = yMax0;
3122     return gTrue;
3123   }
3124   return gFalse;
3125 }
3126 
dump(void * outputStream,TextOutputFunc outputFunc,GBool physLayout)3127 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3128 		    GBool physLayout) {
3129   UnicodeMap *uMap;
3130   TextFlow *flow;
3131   TextBlock *blk;
3132   TextLine *line;
3133   TextLineFrag *frags;
3134   TextWord *word;
3135   int nFrags, fragsSize;
3136   TextLineFrag *frag;
3137   char space[8], eol[16], eop[8];
3138   int spaceLen, eolLen, eopLen;
3139   GBool pageBreaks;
3140   GString *s;
3141   int col, i, d, n;
3142 
3143   // get the output encoding
3144   if (!(uMap = globalParams->getTextEncoding())) {
3145     return;
3146   }
3147   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3148   eolLen = 0; // make gcc happy
3149   switch (globalParams->getTextEOL()) {
3150   case eolUnix:
3151     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3152     break;
3153   case eolDOS:
3154     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3155     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3156     break;
3157   case eolMac:
3158     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3159     break;
3160   }
3161   eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3162   pageBreaks = globalParams->getTextPageBreaks();
3163 
3164   //~ writing mode (horiz/vert)
3165 
3166   // output the page in raw (content stream) order
3167   if (rawOrder) {
3168 
3169     for (word = rawWords; word; word = word->next) {
3170       s = new GString();
3171       dumpFragment(word->text, word->len, uMap, s);
3172       (*outputFunc)(outputStream, s->getCString(), s->getLength());
3173       delete s;
3174       if (word->next &&
3175 	  fabs(word->next->base - word->base) <
3176 	    maxIntraLineDelta * word->fontSize) {
3177 	if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3178 	  (*outputFunc)(outputStream, space, spaceLen);
3179 	}
3180       } else {
3181 	(*outputFunc)(outputStream, eol, eolLen);
3182       }
3183     }
3184 
3185   // output the page, maintaining the original physical layout
3186   } else if (physLayout) {
3187 
3188     // collect the line fragments for the page and sort them
3189     fragsSize = 256;
3190     frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3191     nFrags = 0;
3192     for (i = 0; i < nBlocks; ++i) {
3193       blk = blocks[i];
3194       for (line = blk->lines; line; line = line->next) {
3195 	if (nFrags == fragsSize) {
3196 	  fragsSize *= 2;
3197 	  frags = (TextLineFrag *)greallocn(frags,
3198 					    fragsSize, sizeof(TextLineFrag));
3199 	}
3200 	frags[nFrags].init(line, 0, line->len);
3201 	frags[nFrags].computeCoords(gTrue);
3202 	++nFrags;
3203       }
3204     }
3205     qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3206 
3207 #if 0 // for debugging
3208     printf("*** line fragments ***\n");
3209     for (i = 0; i < nFrags; ++i) {
3210       frag = &frags[i];
3211       printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3212 	     frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
3213       for (n = 0; n < frag->len; ++n) {
3214 	fputc(frag->line->text[frag->start + n] & 0xff, stdout);
3215       }
3216       printf("'\n");
3217     }
3218     printf("\n");
3219 #endif
3220 
3221     // generate output
3222     col = 0;
3223     for (i = 0; i < nFrags; ++i) {
3224       frag = &frags[i];
3225 
3226       // column alignment
3227       for (; col < frag->col; ++col) {
3228 	(*outputFunc)(outputStream, space, spaceLen);
3229       }
3230 
3231       // print the line
3232       s = new GString();
3233       col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3234       (*outputFunc)(outputStream, s->getCString(), s->getLength());
3235       delete s;
3236 
3237       // print one or more returns if necessary
3238       if (i == nFrags - 1 ||
3239 	  frags[i+1].col < col ||
3240 	  fabs(frags[i+1].base - frag->base) >
3241 	    maxIntraLineDelta * frag->line->words->fontSize) {
3242 	if (i < nFrags - 1) {
3243 	  d = (int)((frags[i+1].base - frag->base) /
3244 		    frag->line->words->fontSize);
3245 	  if (d < 1) {
3246 	    d = 1;
3247 	  } else if (d > 5) {
3248 	    d = 5;
3249 	  }
3250 	} else {
3251 	  d = 1;
3252 	}
3253 	for (; d > 0; --d) {
3254 	  (*outputFunc)(outputStream, eol, eolLen);
3255 	}
3256 	col = 0;
3257       }
3258     }
3259 
3260     gfree(frags);
3261 
3262   // output the page, "undoing" the layout
3263   } else {
3264     for (flow = flows; flow; flow = flow->next) {
3265       for (blk = flow->blocks; blk; blk = blk->next) {
3266 	for (line = blk->lines; line; line = line->next) {
3267 	  n = line->len;
3268 	  if (line->hyphenated && (line->next || blk->next)) {
3269 	    --n;
3270 	  }
3271 	  s = new GString();
3272 	  dumpFragment(line->text, n, uMap, s);
3273 	  (*outputFunc)(outputStream, s->getCString(), s->getLength());
3274 	  delete s;
3275 	  if (!line->hyphenated) {
3276 	    if (line->next) {
3277 	      (*outputFunc)(outputStream, space, spaceLen);
3278 	    } else if (blk->next) {
3279 	      //~ this is a bit of a kludge - we should really do a more
3280 	      //~ intelligent determination of paragraphs
3281 	      if (blk->next->lines->words->fontSize ==
3282 		  blk->lines->words->fontSize) {
3283 		(*outputFunc)(outputStream, space, spaceLen);
3284 	      } else {
3285 		(*outputFunc)(outputStream, eol, eolLen);
3286 	      }
3287 	    }
3288 	  }
3289 	}
3290       }
3291       (*outputFunc)(outputStream, eol, eolLen);
3292       (*outputFunc)(outputStream, eol, eolLen);
3293     }
3294   }
3295 
3296   // end of page
3297   if (pageBreaks) {
3298     (*outputFunc)(outputStream, eop, eopLen);
3299   }
3300 
3301   uMap->decRefCnt();
3302 }
3303 
assignColumns(TextLineFrag * frags,int nFrags,GBool oneRot)3304 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3305   TextLineFrag *frag0, *frag1;
3306   int rot, col1, col2, i, j, k;
3307 
3308   // all text in the region has the same rotation -- recompute the
3309   // column numbers based only on the text in the region
3310   if (oneRot) {
3311     qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3312     rot = frags[0].line->rot;
3313     for (i = 0; i < nFrags; ++i) {
3314       frag0 = &frags[i];
3315       col1 = 0;
3316       for (j = 0; j < i; ++j) {
3317 	frag1 = &frags[j];
3318 	col2 = 0; // make gcc happy
3319 	switch (rot) {
3320 	case 0:
3321 	  if (frag0->xMin >= frag1->xMax) {
3322 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3323 				 frag1->line->col[frag1->start]) + 1;
3324 	  } else {
3325 	    for (k = frag1->start;
3326 		 k < frag1->start + frag1->len &&
3327 		   frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3328 					 frag1->line->edge[k+1]);
3329 		 ++k) ;
3330 	    col2 = frag1->col +
3331 	           frag1->line->col[k] - frag1->line->col[frag1->start];
3332 	  }
3333 	  break;
3334 	case 1:
3335 	  if (frag0->yMin >= frag1->yMax) {
3336 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3337 				 frag1->line->col[frag1->start]) + 1;
3338 	  } else {
3339 	    for (k = frag1->start;
3340 		 k < frag1->start + frag1->len &&
3341 		   frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3342 					 frag1->line->edge[k+1]);
3343 		 ++k) ;
3344 	    col2 = frag1->col +
3345 	           frag1->line->col[k] - frag1->line->col[frag1->start];
3346 	  }
3347 	  break;
3348 	case 2:
3349 	  if (frag0->xMax <= frag1->xMin) {
3350 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3351 				 frag1->line->col[frag1->start]) + 1;
3352 	  } else {
3353 	    for (k = frag1->start;
3354 		 k < frag1->start + frag1->len &&
3355 		   frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3356 					 frag1->line->edge[k+1]);
3357 		 ++k) ;
3358 	    col2 = frag1->col +
3359 	           frag1->line->col[k] - frag1->line->col[frag1->start];
3360 	  }
3361 	  break;
3362 	case 3:
3363 	  if (frag0->yMax <= frag1->yMin) {
3364 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3365 				 frag1->line->col[frag1->start]) + 1;
3366 	  } else {
3367 	    for (k = frag1->start;
3368 		 k < frag1->start + frag1->len &&
3369 		   frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3370 					 frag1->line->edge[k+1]);
3371 		 ++k) ;
3372 	    col2 = frag1->col +
3373 	           frag1->line->col[k] - frag1->line->col[frag1->start];
3374 	  }
3375 	  break;
3376 	}
3377 	if (col2 > col1) {
3378 	  col1 = col2;
3379 	}
3380       }
3381       frag0->col = col1;
3382     }
3383 
3384   // the region includes text at different rotations -- use the
3385   // globally assigned column numbers, offset by the minimum column
3386   // number (i.e., shift everything over to column 0)
3387   } else {
3388     col1 = frags[0].col;
3389     for (i = 1; i < nFrags; ++i) {
3390       if (frags[i].col < col1) {
3391 	col1 = frags[i].col;
3392       }
3393     }
3394     for (i = 0; i < nFrags; ++i) {
3395       frags[i].col -= col1;
3396     }
3397   }
3398 }
3399 
dumpFragment(Unicode * text,int len,UnicodeMap * uMap,GString * s)3400 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3401 			   GString *s) {
3402   char lre[8], rle[8], popdf[8], buf[8];
3403   int lreLen, rleLen, popdfLen, n;
3404   int nCols, i, j, k;
3405 
3406   nCols = 0;
3407 
3408   if (uMap->isUnicode()) {
3409 
3410     lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3411     rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3412     popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3413 
3414     if (primaryLR) {
3415 
3416       i = 0;
3417       while (i < len) {
3418 	// output a left-to-right section
3419 	for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3420 	for (k = i; k < j; ++k) {
3421 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3422 	  s->append(buf, n);
3423 	  ++nCols;
3424 	}
3425 	i = j;
3426 	// output a right-to-left section
3427 	for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3428 	if (j > i) {
3429 	  s->append(rle, rleLen);
3430 	  for (k = j - 1; k >= i; --k) {
3431 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3432 	    s->append(buf, n);
3433 	    ++nCols;
3434 	  }
3435 	  s->append(popdf, popdfLen);
3436 	  i = j;
3437 	}
3438       }
3439 
3440     } else {
3441 
3442       s->append(rle, rleLen);
3443       i = len - 1;
3444       while (i >= 0) {
3445 	// output a right-to-left section
3446 	for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3447 	for (k = i; k > j; --k) {
3448 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3449 	  s->append(buf, n);
3450 	  ++nCols;
3451 	}
3452 	i = j;
3453 	// output a left-to-right section
3454 	for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3455 	if (j < i) {
3456 	  s->append(lre, lreLen);
3457 	  for (k = j + 1; k <= i; ++k) {
3458 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3459 	    s->append(buf, n);
3460 	    ++nCols;
3461 	  }
3462 	  s->append(popdf, popdfLen);
3463 	  i = j;
3464 	}
3465       }
3466       s->append(popdf, popdfLen);
3467 
3468     }
3469 
3470   } else {
3471     for (i = 0; i < len; ++i) {
3472       n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3473       s->append(buf, n);
3474       nCols += n;
3475     }
3476   }
3477 
3478   return nCols;
3479 }
3480 
3481 #if TEXTOUT_WORD_LIST
makeWordList(GBool physLayout)3482 TextWordList *TextPage::makeWordList(GBool physLayout) {
3483   return new TextWordList(this, physLayout);
3484 }
3485 #endif
3486 
3487 //------------------------------------------------------------------------
3488 // TextOutputDev
3489 //------------------------------------------------------------------------
3490 
// Output callback used when writing to a file: <stream> is the FILE
// pointer, and <len> bytes starting at <text> are written to it.  The
// fwrite result is not checked.
static void outputToFile(void *stream, char *text, int len) {
  FILE *out = (FILE *)stream;

  fwrite(text, sizeof(char), len, out);
}
3494 
TextOutputDev(char * fileName,GBool physLayoutA,GBool rawOrderA,GBool append)3495 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3496 			     GBool rawOrderA, GBool append) {
3497   text = NULL;
3498   physLayout = physLayoutA;
3499   rawOrder = rawOrderA;
3500   ok = gTrue;
3501 
3502   // open file
3503   needClose = gFalse;
3504   if (fileName) {
3505     if (!strcmp(fileName, "-")) {
3506       outputStream = stdout;
3507 #ifdef WIN32
3508       // keep DOS from munging the end-of-line characters
3509       setmode(fileno(stdout), O_BINARY);
3510 #endif
3511     } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3512       needClose = gTrue;
3513     } else {
3514       error(-1, "Couldn't open text file '%s'", fileName);
3515       ok = gFalse;
3516       return;
3517     }
3518     outputFunc = &outputToFile;
3519   } else {
3520     outputStream = NULL;
3521   }
3522 
3523   // set up text object
3524   text = new TextPage(rawOrderA);
3525 }
3526 
TextOutputDev(TextOutputFunc func,void * stream,GBool physLayoutA,GBool rawOrderA)3527 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3528 			     GBool physLayoutA, GBool rawOrderA) {
3529   outputFunc = func;
3530   outputStream = stream;
3531   needClose = gFalse;
3532   physLayout = physLayoutA;
3533   rawOrder = rawOrderA;
3534   text = new TextPage(rawOrderA);
3535   ok = gTrue;
3536 }
3537 
~TextOutputDev()3538 TextOutputDev::~TextOutputDev() {
3539   if (needClose) {
3540 #ifdef MACOS
3541     ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3542 #endif
3543     fclose((FILE *)outputStream);
3544   }
3545   if (text) {
3546     delete text;
3547   }
3548 }
3549 
startPage(int pageNum,GfxState * state)3550 void TextOutputDev::startPage(int pageNum, GfxState *state)
3551 {
3552   text->startPage(state);
3553 }
3554 
endPage()3555 void TextOutputDev::endPage() {
3556   text->endPage();
3557   text->coalesce(physLayout);
3558   if (outputStream) {
3559     text->dump(outputStream, outputFunc, physLayout);
3560   }
3561 }
3562 
updateFont(GfxState * state)3563 void TextOutputDev::updateFont(GfxState *state) {
3564   text->updateFont(state);
3565 }
3566 
beginString(GfxState * state,GString * s)3567 void TextOutputDev::beginString(GfxState *state, GString *s) {
3568 }
3569 
endString(GfxState * state)3570 void TextOutputDev::endString(GfxState *state) {
3571 }
3572 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,Unicode * u,int uLen)3573 void TextOutputDev::drawChar(GfxState *state, double x, double y,
3574 			     double dx, double dy,
3575 			     double originX, double originY,
3576 			     CharCode c, int nBytes, Unicode *u, int uLen) {
3577   text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
3578 }
3579 
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)3580 GBool TextOutputDev::findText(Unicode *s, int len,
3581 			      GBool startAtTop, GBool stopAtBottom,
3582 			      GBool startAtLast, GBool stopAtLast,
3583 			      GBool caseSensitive, GBool backward,
3584 			      double *xMin, double *yMin,
3585 			      double *xMax, double *yMax) {
3586   return text->findText(s, len, startAtTop, stopAtBottom,
3587 			startAtLast, stopAtLast, caseSensitive, backward,
3588 			xMin, yMin, xMax, yMax);
3589 }
3590 
getText(double xMin,double yMin,double xMax,double yMax)3591 GString *TextOutputDev::getText(double xMin, double yMin,
3592 				double xMax, double yMax) {
3593   return text->getText(xMin, yMin, xMax, yMax);
3594 }
3595 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)3596 GBool TextOutputDev::findCharRange(int pos, int length,
3597 				   double *xMin, double *yMin,
3598 				   double *xMax, double *yMax) {
3599   return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
3600 }
3601 
3602 #if TEXTOUT_WORD_LIST
makeWordList()3603 TextWordList *TextOutputDev::makeWordList() {
3604   return text->makeWordList(physLayout);
3605 }
3606 #endif
3607 
takeText()3608 TextPage *TextOutputDev::takeText() {
3609   TextPage *ret;
3610 
3611   ret = text;
3612   text = new TextPage(rawOrder);
3613   return ret;
3614 }
3615