1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2014 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 #include <aconf.h>
10 
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <stddef.h>
18 #include <math.h>
19 #include <ctype.h>
20 #ifdef _WIN32
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h>    // for setmode
23 #endif
24 #include "gmem.h"
25 #include "GString.h"
26 #include "GList.h"
27 #include "config.h"
28 #include "Error.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
32 #include "GfxState.h"
33 #include "Link.h"
34 #include "TextOutputDev.h"
35 
36 //------------------------------------------------------------------------
37 // parameters
38 //------------------------------------------------------------------------
39 
// Size of bins used for horizontal and vertical profiles is
// splitPrecisionMul * minFontSize.
#define splitPrecisionMul 0.05

// Minimum allowed split precision.
#define minSplitPrecision 0.01

// yMin and yMax (or xMin and xMax for rot=1,3) are adjusted by this
// fraction of the text height, to allow for slightly overlapping
// lines (or large ascent/descent values).  The ascent factor is
// currently 0, i.e., only the descent side is adjusted.
#define ascentAdjustFactor 0
#define descentAdjustFactor 0.35

// Gaps larger than max{gap} - splitGapSlack * avgFontSize are
// considered to be equivalent.
#define splitGapSlack 0.2

// The vertical gap threshold (minimum gap required to split
// vertically) depends on the (approximate) number of lines in the
// block:
//   threshold = (max + slope * nLines) * avgFontSize
// with a min value of vertGapThresholdMin * avgFontSize.  The slope
// is negative, so the threshold decreases as nLines grows.
#define vertGapThresholdMin 0.8
#define vertGapThresholdMax 3
#define vertGapThresholdSlope -0.5

// Vertical gap threshold for table mode (same min/max/slope roles as
// the vertGapThreshold* values above).
#define vertGapThresholdTableMin 0.2
#define vertGapThresholdTableMax 0.5
#define vertGapThresholdTableSlope -0.02

// A large character has a font size larger than
// largeCharThreshold * avgFontSize.
#define largeCharThreshold 1.5

// A block will be split vertically only if the resulting chunk
// widths are greater than vertSplitChunkThreshold * avgFontSize.
#define vertSplitChunkThreshold 2

// Max difference in primary,secondary coordinates (as a fraction of
// the font size) allowed for duplicated text (fake boldface, drop
// shadows) which is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2

// Inter-character spacing that varies by less than this multiple of
// font size is assumed to be equivalent.
#define uniformSpacing 0.07

// Typical word spacing, as a fraction of font size.  This will be
// added to the minimum inter-character spacing, to account for wide
// character spacing.
#define wordSpacing 0.1

// Minimum paragraph indent from left margin, as a fraction of font
// size.
#define minParagraphIndent 0.5

// If the space between two lines is greater than
// paragraphSpacingThreshold * avgLineSpacing, start a new paragraph.
#define paragraphSpacingThreshold 1.2

// If font size changes by at least this much (measured in points)
// between lines, start a new paragraph.
#define paragraphFontSizeDelta 1

// Spaces at the start of a line in physical layout mode are this wide
// (as a multiple of font size).
#define physLayoutSpaceWidth 0.33

// Table cells (TextColumns) are allowed to overlap by this much
// in table layout mode (as a fraction of cell width or height).
#define tableCellOverlapSlack 0.05

// Primary axis delta which will cause a line break in raw mode
// (as a fraction of font size).
#define rawModeLineDelta 0.5

// Secondary axis delta which will cause a word break in raw mode
// (as a fraction of font size).
#define rawModeWordSpacing 0.15

// Secondary axis overlap which will cause a line break in raw mode
// (as a fraction of font size).
#define rawModeCharOverlap 0.2

// Max spacing (as a multiple of font size) allowed between the end of
// a line and a clipped character to be included in that line.
#define clippedTextMaxWordSpace 0.5

// Max width of underlines (in points).
#define maxUnderlineWidth 3

// Max horizontal distance between edge of word and start of underline
// (as a fraction of font size).
#define underlineSlack 0.2

// Max vertical distance between baseline of word and start of
// underline (as a fraction of font size).
#define underlineBaselineSlack 0.2

// Max distance between edge of text and edge of link border (as a
// fraction of font size).
#define hyperlinkSlack 0.2
144 
145 //------------------------------------------------------------------------
146 // TextChar
147 //------------------------------------------------------------------------
148 
// A single character on the page: its Unicode value, bounding box,
// rotation, font, and fill color.
class TextChar {
public:

  // Store the given values; the constructor swaps xMin/xMax (and
  // yMin/yMax) if they arrive reversed.
  TextChar(Unicode cA, int charPosA, int charLenA,
	   double xMinA, double yMinA, double xMaxA, double yMaxA,
	   int rotA, GBool clippedA, GBool invisibleA,
	   TextFontInfo *fontA, double fontSizeA,
	   double colorRA, double colorGA, double colorBA);

  // qsort-style comparators; each argument points to a TextChar
  // pointer.  cmpX orders by xMin, cmpY by yMin.
  static int cmpX(const void *p1, const void *p2);
  static int cmpY(const void *p1, const void *p2);

  Unicode c;			// the character
  int charPos;			// position of the char -- presumably an
				//   offset in the content stream (confirm
				//   against TextPage::addChar)
  int charLen;			// length associated with charPos
  double xMin, yMin, xMax, yMax; // bounding box
  Guchar rot;			// rotation, stored as a byte
  char clipped;			// GBool value stored as a char
  char invisible;		// GBool value stored as a char
  TextFontInfo *font;		// font info (not deleted by this class)
  double fontSize;
  double colorR,		// color components
         colorG,
         colorB;
};
174 
TextChar(Unicode cA,int charPosA,int charLenA,double xMinA,double yMinA,double xMaxA,double yMaxA,int rotA,GBool clippedA,GBool invisibleA,TextFontInfo * fontA,double fontSizeA,double colorRA,double colorGA,double colorBA)175 TextChar::TextChar(Unicode cA, int charPosA, int charLenA,
176 		   double xMinA, double yMinA, double xMaxA, double yMaxA,
177 		   int rotA, GBool clippedA, GBool invisibleA,
178 		   TextFontInfo *fontA, double fontSizeA,
179 		   double colorRA, double colorGA, double colorBA) {
180   double t;
181 
182   c = cA;
183   charPos = charPosA;
184   charLen = charLenA;
185   xMin = xMinA;
186   yMin = yMinA;
187   xMax = xMaxA;
188   yMax = yMaxA;
189   // this can happen with vertical writing mode, or with odd values
190   // for the char/word spacing parameters
191   if (xMin > xMax) {
192     t = xMin; xMin = xMax; xMax = t;
193   }
194   if (yMin > yMax) {
195     t = yMin; yMin = yMax; yMax = t;
196   }
197   rot = (Guchar)rotA;
198   clipped = (char)clippedA;
199   invisible = (char)invisibleA;
200   font = fontA;
201   fontSize = fontSizeA;
202   colorR = colorRA;
203   colorG = colorGA;
204   colorB = colorBA;
205 }
206 
cmpX(const void * p1,const void * p2)207 int TextChar::cmpX(const void *p1, const void *p2) {
208   const TextChar *ch1 = *(const TextChar **)p1;
209   const TextChar *ch2 = *(const TextChar **)p2;
210 
211   if (ch1->xMin < ch2->xMin) {
212     return -1;
213   } else if (ch1->xMin > ch2->xMin) {
214     return 1;
215   } else {
216     return 0;
217   }
218 }
219 
cmpY(const void * p1,const void * p2)220 int TextChar::cmpY(const void *p1, const void *p2) {
221   const TextChar *ch1 = *(const TextChar **)p1;
222   const TextChar *ch2 = *(const TextChar **)p2;
223 
224   if (ch1->yMin < ch2->yMin) {
225     return -1;
226   } else if (ch1->yMin > ch2->yMin) {
227     return 1;
228   } else {
229     return 0;
230   }
231 }
232 
233 //------------------------------------------------------------------------
234 // TextBlock
235 //------------------------------------------------------------------------
236 
// Kind of node in the TextBlock tree.
enum TextBlockType {
  blkVertSplit,			// non-leaf: children are TextBlocks
  blkHorizSplit,		// non-leaf: children are TextBlocks
  blkLeaf			// leaf: children are characters
};
242 
// Tag attached to a TextBlock.  New blocks start out as
// blkTagMulticolumn (see the TextBlock constructor).
enum TextBlockTag {
  blkTagMulticolumn,
  blkTagColumn,
  blkTagLine
};
248 
// A node in a tree of blocks: split nodes hold child TextBlocks, leaf
// nodes hold characters.  Each node tracks the bounding box of the
// children added to it.
class TextBlock {
public:

  TextBlock(TextBlockType typeA, int rotA);
  ~TextBlock();
  // Append a child and grow the bounding box to include it.
  void addChild(TextBlock *child);
  void addChild(TextChar *child);
  // Insert a character at the front of the child list and grow the
  // bounding box to include it.
  void prependChild(TextChar *child);
  // Grow the bounding box to include the (TextBlock) child at
  // childIdx.
  void updateBounds(int childIdx);

  TextBlockType type;
  TextBlockTag tag;
  int rot;
  double xMin, yMin, xMax, yMax;
  GBool smallSplit;		// true for blkVertSplit/blkHorizSplit
				//   where the gap size is small
  GList *children;		// for blkLeaf, children are TextChar
				//   (added via addChild(TextChar *) or
				//   prependChild), and the destructor
				//   deletes only the list;
				//   for others, children are TextBlock
};
268 
TextBlock(TextBlockType typeA,int rotA)269 TextBlock::TextBlock(TextBlockType typeA, int rotA) {
270   type = typeA;
271   tag = blkTagMulticolumn;
272   rot = rotA;
273   xMin = yMin = xMax = yMax = 0;
274   smallSplit = gFalse;
275   children = new GList();
276 }
277 
// Destroy the block.  For a leaf, only the child list itself is
// deleted (the characters in it are not deleted here).  For a split
// node, deleteGList is used on the child TextBlocks -- deleteGList is
// a macro defined elsewhere; presumably it deletes each element and
// then the list.
TextBlock::~TextBlock() {
  if (type == blkLeaf) {
    delete children;
  } else {
    deleteGList(children, TextBlock);
  }
}
285 
addChild(TextBlock * child)286 void TextBlock::addChild(TextBlock *child) {
287   if (children->getLength() == 0) {
288     xMin = child->xMin;
289     yMin = child->yMin;
290     xMax = child->xMax;
291     yMax = child->yMax;
292   } else {
293     if (child->xMin < xMin) {
294       xMin = child->xMin;
295     }
296     if (child->yMin < yMin) {
297       yMin = child->yMin;
298     }
299     if (child->xMax > xMax) {
300       xMax = child->xMax;
301     }
302     if (child->yMax > yMax) {
303       yMax = child->yMax;
304     }
305   }
306   children->append(child);
307 }
308 
addChild(TextChar * child)309 void TextBlock::addChild(TextChar *child) {
310   if (children->getLength() == 0) {
311     xMin = child->xMin;
312     yMin = child->yMin;
313     xMax = child->xMax;
314     yMax = child->yMax;
315   } else {
316     if (child->xMin < xMin) {
317       xMin = child->xMin;
318     }
319     if (child->yMin < yMin) {
320       yMin = child->yMin;
321     }
322     if (child->xMax > xMax) {
323       xMax = child->xMax;
324     }
325     if (child->yMax > yMax) {
326       yMax = child->yMax;
327     }
328   }
329   children->append(child);
330 }
331 
prependChild(TextChar * child)332 void TextBlock::prependChild(TextChar *child) {
333   if (children->getLength() == 0) {
334     xMin = child->xMin;
335     yMin = child->yMin;
336     xMax = child->xMax;
337     yMax = child->yMax;
338   } else {
339     if (child->xMin < xMin) {
340       xMin = child->xMin;
341     }
342     if (child->yMin < yMin) {
343       yMin = child->yMin;
344     }
345     if (child->xMax > xMax) {
346       xMax = child->xMax;
347     }
348     if (child->yMax > yMax) {
349       yMax = child->yMax;
350     }
351   }
352   children->insert(0, child);
353 }
354 
updateBounds(int childIdx)355 void TextBlock::updateBounds(int childIdx) {
356   TextBlock *child;
357 
358   child = (TextBlock *)children->get(childIdx);
359   if (child->xMin < xMin) {
360     xMin = child->xMin;
361   }
362   if (child->yMin < yMin) {
363     yMin = child->yMin;
364   }
365   if (child->xMax > xMax) {
366     xMax = child->xMax;
367   }
368   if (child->yMax > yMax) {
369     yMax = child->yMax;
370   }
371 }
372 
373 //------------------------------------------------------------------------
374 // TextUnderline
375 //------------------------------------------------------------------------
376 
// A line segment from (x0,y0) to (x1,y1) recorded as a possible
// underline.
class TextUnderline {
public:

  // Store the endpoints; the segment is flagged horizontal when the
  // two y coordinates are exactly equal.
  TextUnderline(double x0A, double y0A, double x1A, double y1A)
    { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
  ~TextUnderline() {}

  double x0, y0, x1, y1;	// segment endpoints
  GBool horiz;			// true if y0 == y1
};
387 
388 //------------------------------------------------------------------------
389 // TextLink
390 //------------------------------------------------------------------------
391 
// A rectangular link area and its URI.
class TextLink {
public:

  // Store the rectangle and take ownership of uriA (the destructor
  // deletes it).
  TextLink(double xMinA, double yMinA, double xMaxA, double yMaxA,
	   GString *uriA)
    { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; uri = uriA; }
  ~TextLink();

  double xMin, yMin, xMax, yMax; // link rectangle
  GString *uri;			// owned; may be NULL
};
403 
~TextLink()404 TextLink::~TextLink() {
405   if (uri) {
406     delete uri;
407   }
408 }
409 
410 //------------------------------------------------------------------------
411 // TextOutputControl
412 //------------------------------------------------------------------------
413 
// Set the default output options: reading-order mode, zero fixed
// pitch and fixed line spacing, and the html and clipText flags off.
TextOutputControl::TextOutputControl() {
  mode = textOutReadingOrder;
  fixedPitch = 0;
  fixedLineSpacing = 0;
  html = gFalse;
  clipText = gFalse;
}
421 
422 
423 //------------------------------------------------------------------------
424 // TextFontInfo
425 //------------------------------------------------------------------------
426 
TextFontInfo(GfxState * state)427 TextFontInfo::TextFontInfo(GfxState *state) {
428   GfxFont *gfxFont;
429 
430   gfxFont = state->getFont();
431   if (gfxFont) {
432     fontID = *gfxFont->getID();
433     ascent = gfxFont->getAscent();
434     descent = gfxFont->getDescent();
435     // "odd" ascent/descent values cause trouble more often than not
436     // (in theory these could be legitimate values for oddly designed
437     // fonts -- but they are more often due to buggy PDF generators)
438     // (values that are too small are a different issue -- those seem
439     // to be more commonly legitimate)
440     if (ascent > 1) {
441       ascent = 0.75;
442     }
443     if (descent < -0.5) {
444       descent = -0.25;
445     }
446   } else {
447     fontID.num = -1;
448     fontID.gen = -1;
449     ascent = 0.75;
450     descent = -0.25;
451   }
452   fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy()
453                                              : (GString *)NULL;
454   flags = gfxFont ? gfxFont->getFlags() : 0;
455   mWidth = 0;
456   if (gfxFont && !gfxFont->isCIDFont()) {
457     char *name;
458     int code;
459     for (code = 0; code < 256; ++code) {
460       if ((name = ((Gfx8BitFont *)gfxFont)->getCharName(code)) &&
461 	  name[0] == 'm' && name[1] == '\0') {
462 	mWidth = ((Gfx8BitFont *)gfxFont)->getWidth(code);
463 	break;
464       }
465     }
466   }
467 }
468 
~TextFontInfo()469 TextFontInfo::~TextFontInfo() {
470   if (fontName) {
471     delete fontName;
472   }
473 }
474 
matches(GfxState * state)475 GBool TextFontInfo::matches(GfxState *state) {
476   Ref *id;
477 
478   if (!state->getFont()) {
479     return gFalse;
480   }
481   id = state->getFont()->getID();
482   return id->num == fontID.num && id->gen == fontID.gen;
483 }
484 
485 //------------------------------------------------------------------------
486 // TextWord
487 //------------------------------------------------------------------------
488 
489 // Build a TextWord object, using chars[start .. start+len-1].
490 // (If rot >= 2, the chars list is in reverse order.)
TextWord(GList * chars,int start,int lenA,int rotA,GBool spaceAfterA)491 TextWord::TextWord(GList *chars, int start, int lenA,
492 		   int rotA, GBool spaceAfterA) {
493   TextChar *ch;
494   int i;
495 
496   rot = rotA;
497   len = lenA;
498   text = (Unicode *)gmallocn(len, sizeof(Unicode));
499   edge = (double *)gmallocn(len + 1, sizeof(double));
500   charPos = (int *)gmallocn(len + 1, sizeof(int));
501   switch (rot) {
502   case 0:
503   default:
504     ch = (TextChar *)chars->get(start);
505     xMin = ch->xMin;
506     yMin = ch->yMin;
507     yMax = ch->yMax;
508     ch = (TextChar *)chars->get(start + len - 1);
509     xMax = ch->xMax;
510     break;
511   case 1:
512     ch = (TextChar *)chars->get(start);
513     xMin = ch->xMin;
514     xMax = ch->xMax;
515     yMin = ch->yMin;
516     ch = (TextChar *)chars->get(start + len - 1);
517     yMax = ch->yMax;
518     break;
519   case 2:
520     ch = (TextChar *)chars->get(start);
521     xMax = ch->xMax;
522     yMin = ch->yMin;
523     yMax = ch->yMax;
524     ch = (TextChar *)chars->get(start + len - 1);
525     xMin = ch->xMin;
526     break;
527   case 3:
528     ch = (TextChar *)chars->get(start);
529     xMin = ch->xMin;
530     xMax = ch->xMax;
531     yMax = ch->yMax;
532     ch = (TextChar *)chars->get(start + len - 1);
533     yMin = ch->yMin;
534     break;
535   }
536   for (i = 0; i < len; ++i) {
537     ch = (TextChar *)chars->get(rot >= 2 ? start + len - 1 - i : start + i);
538     text[i] = ch->c;
539     charPos[i] = ch->charPos;
540     if (i == len - 1) {
541       charPos[len] = ch->charPos + ch->charLen;
542     }
543     switch (rot) {
544     case 0:
545     default:
546       edge[i] = ch->xMin;
547       if (i == len - 1) {
548 	edge[len] = ch->xMax;
549       }
550       break;
551     case 1:
552       edge[i] = ch->yMin;
553       if (i == len - 1) {
554 	edge[len] = ch->yMax;
555       }
556       break;
557     case 2:
558       edge[i] = ch->xMax;
559       if (i == len - 1) {
560 	edge[len] = ch->xMin;
561       }
562       break;
563     case 3:
564       edge[i] = ch->yMax;
565       if (i == len - 1) {
566 	edge[len] = ch->yMin;
567       }
568       break;
569     }
570   }
571   ch = (TextChar *)chars->get(start);
572   font = ch->font;
573   fontSize = ch->fontSize;
574   spaceAfter = spaceAfterA;
575   underlined = gFalse;
576   link = NULL;
577   colorR = ch->colorR;
578   colorG = ch->colorG;
579   colorB = ch->colorB;
580   invisible = ch->invisible;
581 }
582 
TextWord(TextWord * word)583 TextWord::TextWord(TextWord *word) {
584   *this = *word;
585   text = (Unicode *)gmallocn(len, sizeof(Unicode));
586   memcpy(text, word->text, len * sizeof(Unicode));
587   edge = (double *)gmallocn(len + 1, sizeof(double));
588   memcpy(edge, word->edge, (len + 1) * sizeof(double));
589   charPos = (int *)gmallocn(len + 1, sizeof(int));
590   memcpy(charPos, word->charPos, (len + 1) * sizeof(int));
591 }
592 
// Free the three arrays allocated by the constructors (and regrown by
// appendChar).
TextWord::~TextWord() {
  gfree(text);
  gfree(edge);
  gfree(charPos);
}
598 
599 // This is used to append a clipped character to a word.
appendChar(TextChar * ch)600 void TextWord::appendChar(TextChar *ch) {
601   if (ch->xMin < xMin) {
602     xMin = ch->xMin;
603   }
604   if (ch->xMax > xMax) {
605     xMax = ch->xMax;
606   }
607   if (ch->yMin < yMin) {
608     yMin = ch->yMin;
609   }
610   if (ch->yMax > yMax) {
611     yMax = ch->yMax;
612   }
613   text = (Unicode *)greallocn(text, len + 1, sizeof(Unicode));
614   edge = (double *)greallocn(edge, len + 2, sizeof(double));
615   charPos = (int *)greallocn(charPos, len + 2, sizeof(int));
616   text[len] = ch->c;
617   charPos[len] = ch->charPos;
618   charPos[len+1] = ch->charPos + ch->charLen;
619   switch (rot) {
620   case 0:
621   default:
622     edge[len] = ch->xMin;
623     edge[len+1] = ch->xMax;
624     break;
625   case 1:
626     edge[len] = ch->yMin;
627     edge[len+1] = ch->yMax;
628     break;
629   case 2:
630     edge[len] = ch->xMax;
631     edge[len+1] = ch->xMin;
632     break;
633   case 3:
634     edge[len] = ch->yMax;
635     edge[len+1] = ch->yMin;
636     break;
637   }
638   ++len;
639 }
640 
cmpYX(const void * p1,const void * p2)641 int TextWord::cmpYX(const void *p1, const void *p2) {
642   const TextWord *word1 = *(const TextWord **)p1;
643   const TextWord *word2 = *(const TextWord **)p2;
644   double cmp;
645 
646   if ((cmp = word1->yMin - word2->yMin) == 0) {
647     cmp = word1->xMin - word2->xMin;
648   }
649   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
650 }
651 
cmpCharPos(const void * p1,const void * p2)652 int TextWord::cmpCharPos(const void *p1, const void *p2) {
653   const TextWord *word1 = *(const TextWord **)p1;
654   const TextWord *word2 = *(const TextWord **)p2;
655 
656   return word1->charPos[0] - word2->charPos[0];
657 }
658 
getText()659 GString *TextWord::getText() {
660   GString *s;
661   UnicodeMap *uMap;
662   char buf[8];
663   int n, i;
664 
665   s = new GString();
666   if (!(uMap = globalParams->getTextEncoding())) {
667     return s;
668   }
669   for (i = 0; i < len; ++i) {
670     n = uMap->mapUnicode(text[i], buf, sizeof(buf));
671     s->append(buf, n);
672   }
673   uMap->decRefCnt();
674   return s;
675 }
676 
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)677 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
678 			   double *xMaxA, double *yMaxA) {
679   if (charIdx < 0 || charIdx >= len) {
680     return;
681   }
682   switch (rot) {
683   case 0:
684     *xMinA = edge[charIdx];
685     *xMaxA = edge[charIdx + 1];
686     *yMinA = yMin;
687     *yMaxA = yMax;
688     break;
689   case 1:
690     *xMinA = xMin;
691     *xMaxA = xMax;
692     *yMinA = edge[charIdx];
693     *yMaxA = edge[charIdx + 1];
694     break;
695   case 2:
696     *xMinA = edge[charIdx + 1];
697     *xMaxA = edge[charIdx];
698     *yMinA = yMin;
699     *yMaxA = yMax;
700     break;
701   case 3:
702     *xMinA = xMin;
703     *xMaxA = xMax;
704     *yMinA = edge[charIdx + 1];
705     *yMaxA = edge[charIdx];
706     break;
707   }
708 }
709 
getBaseline()710 double TextWord::getBaseline() {
711   switch (rot) {
712   case 0:
713   default:
714     return yMax + fontSize * font->descent;
715   case 1:
716     return xMin - fontSize * font->descent;
717   case 2:
718     return yMin - fontSize * font->descent;
719   case 3:
720     return xMax + fontSize * font->descent;
721   }
722 }
723 
getLinkURI()724 GString *TextWord::getLinkURI() {
725   return link ? link->uri : (GString *)NULL;
726 }
727 
728 //------------------------------------------------------------------------
729 // TextLine
730 //------------------------------------------------------------------------
731 
TextLine(GList * wordsA,double xMinA,double yMinA,double xMaxA,double yMaxA,double fontSizeA)732 TextLine::TextLine(GList *wordsA, double xMinA, double yMinA,
733 		   double xMaxA, double yMaxA, double fontSizeA) {
734   TextWord *word;
735   int i, j, k;
736 
737   words = wordsA;
738   rot = 0;
739   xMin = xMinA;
740   yMin = yMinA;
741   xMax = xMaxA;
742   yMax = yMaxA;
743   fontSize = fontSizeA;
744   px = 0;
745   pw = 0;
746 
747   // build the text
748   len = 0;
749   for (i = 0; i < words->getLength(); ++i) {
750     word = (TextWord *)words->get(i);
751     len += word->len;
752     if (word->spaceAfter) {
753       ++len;
754     }
755   }
756   text = (Unicode *)gmallocn(len, sizeof(Unicode));
757   edge = (double *)gmallocn(len + 1, sizeof(double));
758   j = 0;
759   for (i = 0; i < words->getLength(); ++i) {
760     word = (TextWord *)words->get(i);
761     if (i == 0) {
762       rot = word->rot;
763     }
764     for (k = 0; k < word->len; ++k) {
765       text[j] = word->text[k];
766       edge[j] = word->edge[k];
767       ++j;
768     }
769     edge[j] = word->edge[word->len];
770     if (word->spaceAfter) {
771       text[j] = (Unicode)0x0020;
772       ++j;
773       edge[j] = edge[j - 1];
774     }
775   }
776   //~ need to check for other Unicode chars used as hyphens
777   hyphenated = text[len - 1] == (Unicode)'-';
778 }
779 
// Release the word list and the text/edge arrays.  deleteGList is a
// macro defined elsewhere; presumably it deletes each TextWord and
// then the list itself.
TextLine::~TextLine() {
  deleteGList(words, TextWord);
  gfree(text);
  gfree(edge);
}
785 
getBaseline()786 double TextLine::getBaseline() {
787   TextWord *word0;
788 
789   word0 = (TextWord *)words->get(0);
790   switch (rot) {
791   case 0:
792   default:
793     return yMax + fontSize * word0->font->descent;
794   case 1:
795     return xMin - fontSize * word0->font->descent;
796   case 2:
797     return yMin - fontSize * word0->font->descent;
798   case 3:
799     return xMax + fontSize * word0->font->descent;
800   }
801 }
802 
803 //------------------------------------------------------------------------
804 // TextParagraph
805 //------------------------------------------------------------------------
806 
TextParagraph(GList * linesA)807 TextParagraph::TextParagraph(GList *linesA) {
808   TextLine *line;
809   int i;
810 
811   lines = linesA;
812   xMin = yMin = xMax = yMax = 0;
813   for (i = 0; i < lines->getLength(); ++i) {
814     line = (TextLine *)lines->get(i);
815     if (i == 0 || line->xMin < xMin) {
816       xMin = line->xMin;
817     }
818     if (i == 0 || line->yMin < yMin) {
819       yMin = line->yMin;
820     }
821     if (i == 0 || line->xMax > xMax) {
822       xMax = line->xMax;
823     }
824     if (i == 0 || line->yMax > yMax) {
825       yMax = line->yMax;
826     }
827   }
828 }
829 
// Release the line list.  deleteGList is a macro defined elsewhere;
// presumably it deletes each TextLine and then the list itself.
TextParagraph::~TextParagraph() {
  deleteGList(lines, TextLine);
}
833 
834 //------------------------------------------------------------------------
835 // TextColumn
836 //------------------------------------------------------------------------
837 
TextColumn(GList * paragraphsA,double xMinA,double yMinA,double xMaxA,double yMaxA)838 TextColumn::TextColumn(GList *paragraphsA, double xMinA, double yMinA,
839 		       double xMaxA, double yMaxA) {
840   paragraphs = paragraphsA;
841   xMin = xMinA;
842   yMin = yMinA;
843   xMax = xMaxA;
844   yMax = yMaxA;
845   px = py = 0;
846   pw = ph = 0;
847 }
848 
// Release the paragraph list.  deleteGList is a macro defined
// elsewhere; presumably it deletes each TextParagraph and then the
// list itself.
TextColumn::~TextColumn() {
  deleteGList(paragraphs, TextParagraph);
}
852 
cmpX(const void * p1,const void * p2)853 int TextColumn::cmpX(const void *p1, const void *p2) {
854   const TextColumn *col1 = *(const TextColumn **)p1;
855   const TextColumn *col2 = *(const TextColumn **)p2;
856 
857   if (col1->xMin < col2->xMin) {
858     return -1;
859   } else if (col1->xMin > col2->xMin) {
860     return 1;
861   } else {
862     return 0;
863   }
864 }
865 
cmpY(const void * p1,const void * p2)866 int TextColumn::cmpY(const void *p1, const void *p2) {
867   const TextColumn *col1 = *(const TextColumn **)p1;
868   const TextColumn *col2 = *(const TextColumn **)p2;
869 
870   if (col1->yMin < col2->yMin) {
871     return -1;
872   } else if (col1->yMin > col2->yMin) {
873     return 1;
874   } else {
875     return 0;
876   }
877 }
878 
cmpPX(const void * p1,const void * p2)879 int TextColumn::cmpPX(const void *p1, const void *p2) {
880   const TextColumn *col1 = *(const TextColumn **)p1;
881   const TextColumn *col2 = *(const TextColumn **)p2;
882 
883   if (col1->px < col2->px) {
884     return -1;
885   } else if (col1->px > col2->px) {
886     return 1;
887   } else {
888     return 0;
889   }
890 }
891 
892 //------------------------------------------------------------------------
893 // TextWordList
894 //------------------------------------------------------------------------
895 
// Wrap a list of TextWords.  The object keeps wordsA; the destructor
// deletes it.
TextWordList::TextWordList(GList *wordsA) {
  words = wordsA;
}
899 
// Release the word list.  deleteGList is a macro defined elsewhere;
// presumably it deletes each TextWord and then the list itself.
TextWordList::~TextWordList() {
  deleteGList(words, TextWord);
}
903 
// Return the number of words in the list.
int TextWordList::getLength() {
  return words->getLength();
}
907 
get(int idx)908 TextWord *TextWordList::get(int idx) {
909   if (idx < 0 || idx >= words->getLength()) {
910     return NULL;
911   }
912   return (TextWord *)words->get(idx);
913 }
914 
915 //------------------------------------------------------------------------
916 // TextPage
917 //------------------------------------------------------------------------
918 
TextPage(TextOutputControl * controlA)919 TextPage::TextPage(TextOutputControl *controlA) {
920   control = *controlA;
921   pageWidth = pageHeight = 0;
922   charPos = 0;
923   curFont = NULL;
924   curFontSize = 0;
925   curRot = 0;
926   nTinyChars = 0;
927   actualText = NULL;
928   actualTextLen = 0;
929   actualTextX0 = 0;
930   actualTextY0 = 0;
931   actualTextX1 = 0;
932   actualTextY1 = 0;
933   actualTextNBytes = 0;
934 
935   chars = new GList();
936   fonts = new GList();
937 
938   underlines = new GList();
939   links = new GList();
940 
941   findCols = NULL;
942   findLR = gTrue;
943   lastFindXMin = lastFindYMin = 0;
944   haveLastFind = gFalse;
945 }
946 
// Release everything owned by the page.  clear() runs first; in this
// file it frees the current lists (and actualText) and allocates
// fresh empty lists, which are then deleted here.
TextPage::~TextPage() {
  clear();
  deleteGList(chars, TextChar);
  deleteGList(fonts, TextFontInfo);
  deleteGList(underlines, TextUnderline);
  deleteGList(links, TextLink);
  // clear() already resets findCols to NULL when it was set, so this
  // is a safety net
  if (findCols) {
    deleteGList(findCols, TextColumn);
  }
}
957 
startPage(GfxState * state)958 void TextPage::startPage(GfxState *state) {
959   clear();
960   if (state) {
961     pageWidth = state->getPageWidth();
962     pageHeight = state->getPageHeight();
963   } else {
964     pageWidth = pageHeight = 0;
965   }
966 }
967 
clear()968 void TextPage::clear() {
969   pageWidth = pageHeight = 0;
970   charPos = 0;
971   curFont = NULL;
972   curFontSize = 0;
973   curRot = 0;
974   nTinyChars = 0;
975   gfree(actualText);
976   actualText = NULL;
977   actualTextLen = 0;
978   actualTextNBytes = 0;
979   deleteGList(chars, TextChar);
980   chars = new GList();
981   deleteGList(fonts, TextFontInfo);
982   fonts = new GList();
983   deleteGList(underlines, TextUnderline);
984   underlines = new GList();
985   deleteGList(links, TextLink);
986   links = new GList();
987 
988   if (findCols) {
989     deleteGList(findCols, TextColumn);
990     findCols = NULL;
991   }
992   findLR = gTrue;
993   lastFindXMin = lastFindYMin = 0;
994   haveLastFind = gFalse;
995 }
996 
updateFont(GfxState * state)997 void TextPage::updateFont(GfxState *state) {
998   GfxFont *gfxFont;
999   double *fm;
1000   char *name;
1001   int code, mCode, letterCode, anyCode;
1002   double w;
1003   double m[4], m2[4];
1004   int i;
1005 
1006   // get the font info object
1007   curFont = NULL;
1008   for (i = 0; i < fonts->getLength(); ++i) {
1009     curFont = (TextFontInfo *)fonts->get(i);
1010     if (curFont->matches(state)) {
1011       break;
1012     }
1013     curFont = NULL;
1014   }
1015   if (!curFont) {
1016     curFont = new TextFontInfo(state);
1017     fonts->append(curFont);
1018   }
1019 
1020   // adjust the font size
1021   gfxFont = state->getFont();
1022   curFontSize = state->getTransformedFontSize();
1023   if (gfxFont && gfxFont->getType() == fontType3) {
1024     // This is a hack which makes it possible to deal with some Type 3
1025     // fonts.  The problem is that it's impossible to know what the
1026     // base coordinate system used in the font is without actually
1027     // rendering the font.  This code tries to guess by looking at the
1028     // width of the character 'm' (which breaks if the font is a
1029     // subset that doesn't contain 'm').
1030     mCode = letterCode = anyCode = -1;
1031     for (code = 0; code < 256; ++code) {
1032       name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1033       if (name && name[0] == 'm' && name[1] == '\0') {
1034 	mCode = code;
1035       }
1036       if (letterCode < 0 && name && name[1] == '\0' &&
1037 	  ((name[0] >= 'A' && name[0] <= 'Z') ||
1038 	   (name[0] >= 'a' && name[0] <= 'z'))) {
1039 	letterCode = code;
1040       }
1041       if (anyCode < 0 && name &&
1042 	  ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1043 	anyCode = code;
1044       }
1045     }
1046     if (mCode >= 0 &&
1047 	(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1048       // 0.6 is a generic average 'm' width -- yes, this is a hack
1049       curFontSize *= w / 0.6;
1050     } else if (letterCode >= 0 &&
1051 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1052       // even more of a hack: 0.5 is a generic letter width
1053       curFontSize *= w / 0.5;
1054     } else if (anyCode >= 0 &&
1055 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1056       // better than nothing: 0.5 is a generic character width
1057       curFontSize *= w / 0.5;
1058     }
1059     fm = gfxFont->getFontMatrix();
1060     if (fm[0] != 0) {
1061       curFontSize *= fabs(fm[3] / fm[0]);
1062     }
1063   }
1064 
1065   // compute the rotation
1066   state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1067   if (gfxFont && gfxFont->getType() == fontType3) {
1068     fm = gfxFont->getFontMatrix();
1069     m2[0] = fm[0] * m[0] + fm[1] * m[2];
1070     m2[1] = fm[0] * m[1] + fm[1] * m[3];
1071     m2[2] = fm[2] * m[0] + fm[3] * m[2];
1072     m2[3] = fm[2] * m[1] + fm[3] * m[3];
1073     m[0] = m2[0];
1074     m[1] = m2[1];
1075     m[2] = m2[2];
1076     m[3] = m2[3];
1077   }
1078   if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1079     curRot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
1080   } else {
1081     curRot = (m[2] > 0) ? 1 : 3;
1082   }
1083 }
1084 
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)1085 void TextPage::addChar(GfxState *state, double x, double y,
1086 		       double dx, double dy,
1087 		       CharCode c, int nBytes, Unicode *u, int uLen) {
1088   double x1, y1, x2, y2, w1, h1, dx2, dy2, ascent, descent, sp;
1089   double xMin, yMin, xMax, yMax;
1090   double clipXMin, clipYMin, clipXMax, clipYMax;
1091   GfxRGB rgb;
1092   GBool clipped, rtl;
1093   int i, j;
1094 
1095   // if we're in an ActualText span, save the position info (the
1096   // ActualText chars will be added by TextPage::endActualText()).
1097   if (actualText) {
1098     if (!actualTextNBytes) {
1099       actualTextX0 = x;
1100       actualTextY0 = y;
1101     }
1102     actualTextX1 = x + dx;
1103     actualTextY1 = y + dy;
1104     actualTextNBytes += nBytes;
1105     return;
1106   }
1107 
1108   // subtract char and word spacing from the dx,dy values
1109   sp = state->getCharSpace();
1110   if (c == (CharCode)0x20) {
1111     sp += state->getWordSpace();
1112   }
1113   state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1114   dx -= dx2;
1115   dy -= dy2;
1116   state->transformDelta(dx, dy, &w1, &h1);
1117 
1118   // throw away chars that aren't inside the page bounds
1119   // (and also do a sanity check on the character size)
1120   state->transform(x, y, &x1, &y1);
1121   if (x1 + w1 < 0 || x1 > pageWidth ||
1122       y1 + h1 < 0 || y1 > pageHeight ||
1123       w1 > pageWidth || h1 > pageHeight) {
1124     charPos += nBytes;
1125     return;
1126   }
1127 
1128   // check the tiny chars limit
1129   if (!globalParams->getTextKeepTinyChars() &&
1130       fabs(w1) < 3 && fabs(h1) < 3) {
1131     if (++nTinyChars > 50000) {
1132       charPos += nBytes;
1133       return;
1134     }
1135   }
1136 
1137   // skip space characters
1138   if (uLen == 1 && u[0] == (Unicode)0x20) {
1139     charPos += nBytes;
1140     return;
1141   }
1142 
1143   // check for clipping
1144   clipped = gFalse;
1145   if (control.clipText) {
1146     state->getClipBBox(&clipXMin, &clipYMin, &clipXMax, &clipYMax);
1147     if (x1 + 0.1 * w1 < clipXMin || x1 + 0.9 * w1 > clipXMax ||
1148 	y1 + 0.1 * h1 < clipYMin || y1 + 0.9 * h1 > clipYMax) {
1149       clipped = gTrue;
1150     }
1151   }
1152 
1153   // add the characters
1154   if (uLen > 0) {
1155 
1156     // handle right-to-left ligatures: if there are multiple Unicode
1157     // characters, and they're all right-to-left, insert them in
1158     // right-to-left order
1159     if (uLen > 1) {
1160       rtl = gTrue;
1161       for (i = 0; i < uLen; ++i) {
1162 	if (!unicodeTypeR(u[i])) {
1163 	  rtl = gFalse;
1164 	  break;
1165 	}
1166       }
1167     } else {
1168       rtl = gFalse;
1169     }
1170 
1171     w1 /= uLen;
1172     h1 /= uLen;
1173     ascent = curFont->ascent * curFontSize;
1174     descent = curFont->descent * curFontSize;
1175     for (i = 0; i < uLen; ++i) {
1176       x2 = x1 + i * w1;
1177       y2 = y1 + i * h1;
1178       switch (curRot) {
1179       case 0:
1180       default:
1181 	xMin = x2;
1182 	xMax = x2 + w1;
1183 	yMin = y2 - ascent;
1184 	yMax = y2 - descent;
1185 	break;
1186       case 1:
1187 	xMin = x2 + descent;
1188 	xMax = x2 + ascent;
1189 	yMin = y2;
1190 	yMax = y2 + h1;
1191 	break;
1192       case 2:
1193 	xMin = x2 + w1;
1194 	xMax = x2;
1195 	yMin = y2 + descent;
1196 	yMax = y2 + ascent;
1197 	break;
1198       case 3:
1199 	xMin = x2 - ascent;
1200 	xMax = x2 - descent;
1201 	yMin = y2 + h1;
1202 	yMax = y2;
1203 	break;
1204       }
1205       if ((state->getRender() & 3) == 1) {
1206 	state->getStrokeRGB(&rgb);
1207       } else {
1208 	state->getFillRGB(&rgb);
1209       }
1210       if (rtl) {
1211 	j = uLen - 1 - i;
1212       } else {
1213 	j = i;
1214       }
1215       chars->append(new TextChar(u[j], charPos, nBytes, xMin, yMin, xMax, yMax,
1216 				 curRot, clipped,
1217 				 state->getRender() == 3,
1218 				 curFont, curFontSize,
1219 				 colToDbl(rgb.r), colToDbl(rgb.g),
1220 				 colToDbl(rgb.b)));
1221     }
1222   }
1223 
1224   charPos += nBytes;
1225 }
1226 
incCharCount(int nChars)1227 void TextPage::incCharCount(int nChars) {
1228   charPos += nChars;
1229 }
1230 
beginActualText(GfxState * state,Unicode * u,int uLen)1231 void TextPage::beginActualText(GfxState *state, Unicode *u, int uLen) {
1232   if (actualText) {
1233     gfree(actualText);
1234   }
1235   actualText = (Unicode *)gmallocn(uLen, sizeof(Unicode));
1236   memcpy(actualText, u, uLen * sizeof(Unicode));
1237   actualTextLen = uLen;
1238   actualTextNBytes = 0;
1239 }
1240 
endActualText(GfxState * state)1241 void TextPage::endActualText(GfxState *state) {
1242   Unicode *u;
1243 
1244   u = actualText;
1245   actualText = NULL;  // so we can call TextPage::addChar()
1246   if (actualTextNBytes) {
1247     // now that we have the position info for all of the text inside
1248     // the marked content span, we feed the "ActualText" back through
1249     // addChar()
1250     addChar(state, actualTextX0, actualTextY0,
1251 	    actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
1252 	    0, actualTextNBytes, u, actualTextLen);
1253   }
1254   gfree(u);
1255   actualText = NULL;
1256   actualTextLen = 0;
1257   actualTextNBytes = gFalse;
1258 }
1259 
addUnderline(double x0,double y0,double x1,double y1)1260 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
1261   underlines->append(new TextUnderline(x0, y0, x1, y1));
1262 }
1263 
addLink(double xMin,double yMin,double xMax,double yMax,Link * link)1264 void TextPage::addLink(double xMin, double yMin, double xMax, double yMax,
1265 		       Link *link) {
1266   GString *uri;
1267 
1268   if (link && link->getAction() && link->getAction()->getKind() == actionURI) {
1269     uri = ((LinkURI *)link->getAction())->getURI()->copy();
1270     links->append(new TextLink(xMin, yMin, xMax, yMax, uri));
1271   }
1272 }
1273 
1274 //------------------------------------------------------------------------
1275 // TextPage: output
1276 //------------------------------------------------------------------------
1277 
write(void * outputStream,TextOutputFunc outputFunc)1278 void TextPage::write(void *outputStream, TextOutputFunc outputFunc) {
1279   UnicodeMap *uMap;
1280   char space[8], eol[16], eop[8];
1281   int spaceLen, eolLen, eopLen;
1282   GBool pageBreaks;
1283 
1284   // get the output encoding
1285   if (!(uMap = globalParams->getTextEncoding())) {
1286     return;
1287   }
1288   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1289   eolLen = 0; // make gcc happy
1290   switch (globalParams->getTextEOL()) {
1291   case eolUnix:
1292     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
1293     break;
1294   case eolDOS:
1295     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1296     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
1297     break;
1298   case eolMac:
1299     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1300     break;
1301   }
1302   eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
1303   pageBreaks = globalParams->getTextPageBreaks();
1304 
1305   switch (control.mode) {
1306   case textOutReadingOrder:
1307     writeReadingOrder(outputStream, outputFunc, uMap, space, spaceLen,
1308 		      eol, eolLen);
1309     break;
1310   case textOutPhysLayout:
1311   case textOutTableLayout:
1312     writePhysLayout(outputStream, outputFunc, uMap, space, spaceLen,
1313 		    eol, eolLen);
1314     break;
1315   case textOutLinePrinter:
1316     writeLinePrinter(outputStream, outputFunc, uMap, space, spaceLen,
1317 		     eol, eolLen);
1318     break;
1319   case textOutRawOrder:
1320     writeRaw(outputStream, outputFunc, uMap, space, spaceLen,
1321 	     eol, eolLen);
1322     break;
1323   }
1324 
1325   // end of page
1326   if (pageBreaks) {
1327     (*outputFunc)(outputStream, eop, eopLen);
1328   }
1329 
1330   uMap->decRefCnt();
1331 }
1332 
writeReadingOrder(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1333 void TextPage::writeReadingOrder(void *outputStream,
1334 				 TextOutputFunc outputFunc,
1335 				 UnicodeMap *uMap,
1336 				 char *space, int spaceLen,
1337 				 char *eol, int eolLen) {
1338   TextBlock *tree;
1339   TextColumn *col;
1340   TextParagraph *par;
1341   TextLine *line;
1342   GList *columns;
1343   GBool primaryLR;
1344   GString *s;
1345   int colIdx, parIdx, lineIdx, rot, n;
1346 
1347   rot = rotateChars(chars);
1348   primaryLR = checkPrimaryLR(chars);
1349   tree = splitChars(chars);
1350 #if 0 //~debug
1351   dumpTree(tree);
1352 #endif
1353   if (!tree) {
1354     // no text
1355     unrotateChars(chars, rot);
1356     return;
1357   }
1358   columns = buildColumns(tree);
1359   delete tree;
1360   unrotateChars(chars, rot);
1361   if (control.html) {
1362     rotateUnderlinesAndLinks(rot);
1363     generateUnderlinesAndLinks(columns);
1364   }
1365 #if 0 //~debug
1366   dumpColumns(columns);
1367 #endif
1368 
1369   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1370     col = (TextColumn *)columns->get(colIdx);
1371     for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
1372       par = (TextParagraph *)col->paragraphs->get(parIdx);
1373       for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
1374 	line = (TextLine *)par->lines->get(lineIdx);
1375 	n = line->len;
1376 	if (line->hyphenated && lineIdx + 1 < par->lines->getLength()) {
1377 	  --n;
1378 	}
1379 	s = new GString();
1380 	encodeFragment(line->text, n, uMap, primaryLR, s);
1381 	if (lineIdx + 1 < par->lines->getLength() && !line->hyphenated) {
1382 	  s->append(space, spaceLen);
1383 	}
1384 	(*outputFunc)(outputStream, s->getCString(), s->getLength());
1385 	delete s;
1386       }
1387       (*outputFunc)(outputStream, eol, eolLen);
1388     }
1389     (*outputFunc)(outputStream, eol, eolLen);
1390   }
1391 
1392   deleteGList(columns, TextColumn);
1393 }
1394 
makeColumns()1395 GList *TextPage::makeColumns() {
1396   TextBlock *tree;
1397   GList *columns;
1398 
1399   tree = splitChars(chars);
1400   if (!tree) {
1401     // no text
1402     return new GList();
1403   }
1404   columns = buildColumns(tree);
1405   delete tree;
1406   if (control.html) {
1407     generateUnderlinesAndLinks(columns);
1408   }
1409   return columns;
1410 }
1411 
1412 // This handles both physical layout and table layout modes.
writePhysLayout(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1413 void TextPage::writePhysLayout(void *outputStream,
1414 			       TextOutputFunc outputFunc,
1415 			       UnicodeMap *uMap,
1416 			       char *space, int spaceLen,
1417 			       char *eol, int eolLen) {
1418   TextBlock *tree;
1419   GString **out;
1420   int *outLen;
1421   TextColumn *col;
1422   TextParagraph *par;
1423   TextLine *line;
1424   GList *columns;
1425   GBool primaryLR;
1426   int ph, colIdx, parIdx, lineIdx, rot, y, i;
1427 
1428 #if 0 //~debug
1429   dumpChars(chars);
1430 #endif
1431   rot = rotateChars(chars);
1432   primaryLR = checkPrimaryLR(chars);
1433   tree = splitChars(chars);
1434 #if 0 //~debug
1435   dumpTree(tree);
1436 #endif
1437   if (!tree) {
1438     // no text
1439     unrotateChars(chars, rot);
1440     return;
1441   }
1442   columns = buildColumns(tree);
1443   delete tree;
1444   unrotateChars(chars, rot);
1445   if (control.html) {
1446     rotateUnderlinesAndLinks(rot);
1447     generateUnderlinesAndLinks(columns);
1448   }
1449   ph = assignPhysLayoutPositions(columns);
1450 #if 0 //~debug
1451   dumpColumns(columns);
1452 #endif
1453 
1454   out = (GString **)gmallocn(ph, sizeof(GString *));
1455   outLen = (int *)gmallocn(ph, sizeof(int));
1456   for (i = 0; i < ph; ++i) {
1457     out[i] = NULL;
1458     outLen[i] = 0;
1459   }
1460 
1461   columns->sort(&TextColumn::cmpPX);
1462   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1463     col = (TextColumn *)columns->get(colIdx);
1464     y = col->py;
1465     for (parIdx = 0;
1466 	 parIdx < col->paragraphs->getLength() && y < ph;
1467 	 ++parIdx) {
1468       par = (TextParagraph *)col->paragraphs->get(parIdx);
1469       for (lineIdx = 0;
1470 	   lineIdx < par->lines->getLength() && y < ph;
1471 	   ++lineIdx) {
1472 	line = (TextLine *)par->lines->get(lineIdx);
1473 	if (!out[y]) {
1474 	  out[y] = new GString();
1475 	}
1476 	while (outLen[y] < col->px + line->px) {
1477 	  out[y]->append(space, spaceLen);
1478 	  ++outLen[y];
1479 	}
1480 	encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
1481 	outLen[y] += line->pw;
1482 	++y;
1483       }
1484       if (parIdx + 1 < col->paragraphs->getLength()) {
1485 	++y;
1486       }
1487     }
1488   }
1489 
1490   for (i = 0; i < ph; ++i) {
1491     if (out[i]) {
1492       (*outputFunc)(outputStream, out[i]->getCString(), out[i]->getLength());
1493       delete out[i];
1494     }
1495     (*outputFunc)(outputStream, eol, eolLen);
1496   }
1497 
1498   gfree(out);
1499   gfree(outLen);
1500 
1501   deleteGList(columns, TextColumn);
1502 }
1503 
writeLinePrinter(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1504 void TextPage::writeLinePrinter(void *outputStream,
1505 				TextOutputFunc outputFunc,
1506 				UnicodeMap *uMap,
1507 				char *space, int spaceLen,
1508 				char *eol, int eolLen) {
1509   TextChar *ch, *ch2;
1510   GList *line;
1511   GString *s;
1512   char buf[8];
1513   double pitch, lineSpacing, delta;
1514   double yMin0, yShift, xMin0, xShift;
1515   double y, x;
1516   int rot, n, i, j, k;
1517 
1518   rot = rotateChars(chars);
1519   chars->sort(&TextChar::cmpX);
1520   removeDuplicates(chars, 0);
1521   chars->sort(&TextChar::cmpY);
1522 
1523   // get character pitch
1524   if (control.fixedPitch > 0) {
1525     pitch = control.fixedPitch;
1526   } else {
1527     // compute (approximate) character pitch
1528     pitch = pageWidth;
1529     for (i = 0; i < chars->getLength(); ++i) {
1530       ch = (TextChar *)chars->get(i);
1531       for (j = i + 1; j < chars->getLength(); ++j) {
1532 	ch2 = (TextChar *)chars->get(j);
1533 	if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) <
1534 	      ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin) &&
1535 	    ch->yMin + ascentAdjustFactor * (ch->yMax - ch->yMin) <
1536 	      ch2->yMax - descentAdjustFactor * (ch2->yMax - ch2->yMin)) {
1537 	  delta = fabs(ch2->xMin - ch->xMin);
1538 	  if (delta > 0 && delta < pitch) {
1539 	    pitch = delta;
1540 	  }
1541 	}
1542       }
1543     }
1544   }
1545 
1546   // get line spacing
1547   if (control.fixedLineSpacing > 0) {
1548     lineSpacing = control.fixedLineSpacing;
1549   } else {
1550     // compute (approximate) line spacing
1551     lineSpacing = pageHeight;
1552     i = 0;
1553     while (i < chars->getLength()) {
1554       ch = (TextChar *)chars->get(i);
1555       // look for the first char that does not (substantially)
1556       // vertically overlap this one
1557       delta = 0;
1558       for (++i; delta == 0 && i < chars->getLength(); ++i) {
1559 	ch2 = (TextChar *)chars->get(i);
1560 	if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) >
1561 	    ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin)) {
1562 	  delta = ch2->yMin - ch->yMin;
1563 	}
1564       }
1565       if (delta > 0 && delta < lineSpacing) {
1566 	lineSpacing = delta;
1567       }
1568     }
1569   }
1570 
1571   // shift the grid to avoid problems with floating point accuracy --
1572   // for fixed line spacing, this avoids problems with
1573   // dropping/inserting blank lines
1574   if (chars->getLength()) {
1575     yMin0 = ((TextChar *)chars->get(0))->yMin;
1576     yShift = yMin0 - (int)(yMin0 / lineSpacing + 0.5) * lineSpacing
1577              - 0.5 * lineSpacing;
1578   } else {
1579     yShift = 0;
1580   }
1581 
1582   // for each line...
1583   i = 0;
1584   j = chars->getLength() - 1;
1585   for (y = yShift; y < pageHeight; y += lineSpacing) {
1586 
1587     // get the characters in this line
1588     line = new GList;
1589     while (i < chars->getLength() &&
1590 	   ((TextChar *)chars->get(i))->yMin < y + lineSpacing) {
1591       line->append(chars->get(i++));
1592     }
1593     line->sort(&TextChar::cmpX);
1594 
1595     // shift the grid to avoid problems with floating point accuracy
1596     // -- for fixed char spacing, this avoids problems with
1597     // dropping/inserting spaces
1598     if (line->getLength()) {
1599       xMin0 = ((TextChar *)line->get(0))->xMin;
1600       xShift = xMin0 - (int)(xMin0 / pitch + 0.5) * pitch - 0.5 * pitch;
1601     } else {
1602       xShift = 0;
1603     }
1604 
1605     // write the line
1606     s = new GString();
1607     x = xShift;
1608     k = 0;
1609     while (k < line->getLength()) {
1610       ch = (TextChar *)line->get(k);
1611       if (ch->xMin < x + pitch) {
1612 	n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
1613 	s->append(buf, n);
1614 	++k;
1615       } else {
1616 	s->append(space, spaceLen);
1617 	n = spaceLen;
1618       }
1619       x += (uMap->isUnicode() ? 1 : n) * pitch;
1620     }
1621     s->append(eol, eolLen);
1622     (*outputFunc)(outputStream, s->getCString(), s->getLength());
1623     delete s;
1624     delete line;
1625   }
1626 
1627   unrotateChars(chars, rot);
1628 }
1629 
writeRaw(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1630 void TextPage::writeRaw(void *outputStream,
1631 			TextOutputFunc outputFunc,
1632 			UnicodeMap *uMap,
1633 			char *space, int spaceLen,
1634 			char *eol, int eolLen) {
1635   TextChar *ch, *ch2;
1636   GString *s;
1637   char buf[8];
1638   int n, i;
1639 
1640   s = new GString();
1641 
1642   for (i = 0; i < chars->getLength(); ++i) {
1643 
1644     // process one char
1645     ch = (TextChar *)chars->get(i);
1646     n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
1647     s->append(buf, n);
1648 
1649     // check for space or eol
1650     if (i+1 < chars->getLength()) {
1651       ch2 = (TextChar *)chars->get(i+1);
1652       if (ch2->rot != ch->rot) {
1653 	s->append(eol, eolLen);
1654       } else {
1655 	switch (ch->rot) {
1656 	case 0:
1657 	default:
1658 	  if (fabs(ch2->yMin - ch->yMin) > rawModeLineDelta * ch->fontSize ||
1659 	      ch2->xMin - ch->xMax < -rawModeCharOverlap * ch->fontSize) {
1660 	    s->append(eol, eolLen);
1661 	  } else if (ch2->xMin - ch->xMax >
1662 		     rawModeWordSpacing * ch->fontSize) {
1663 	    s->append(space, spaceLen);
1664 	  }
1665 	  break;
1666 	case 1:
1667 	  if (fabs(ch->xMax - ch2->xMax) > rawModeLineDelta * ch->fontSize ||
1668 	      ch2->yMin - ch->yMax < -rawModeCharOverlap * ch->fontSize) {
1669 	    s->append(eol, eolLen);
1670 	  } else if (ch2->yMin - ch->yMax >
1671 		     rawModeWordSpacing * ch->fontSize) {
1672 	    s->append(space, spaceLen);
1673 	  }
1674 	  break;
1675 	case 2:
1676 	  if (fabs(ch->yMax - ch2->yMax) > rawModeLineDelta * ch->fontSize ||
1677 	      ch->xMin - ch2->xMax  < -rawModeCharOverlap * ch->fontSize) {
1678 	    s->append(eol, eolLen);
1679 	  } else if (ch->xMin - ch2->xMax >
1680 		     rawModeWordSpacing * ch->fontSize) {
1681 	    s->append(space, spaceLen);
1682 	  }
1683 	  break;
1684 	case 3:
1685 	  if (fabs(ch2->xMin - ch->xMin) > rawModeLineDelta * ch->fontSize ||
1686 	      ch->yMin - ch2->yMax  < -rawModeCharOverlap * ch->fontSize) {
1687 	    s->append(eol, eolLen);
1688 	  } else if (ch->yMin - ch2->yMax >
1689 		     rawModeWordSpacing * ch->fontSize) {
1690 	    s->append(space, spaceLen);
1691 	  }
1692 	  break;
1693 	}
1694       }
1695     } else {
1696       s->append(eol, eolLen);
1697     }
1698 
1699     if (s->getLength() > 1000) {
1700       (*outputFunc)(outputStream, s->getCString(), s->getLength());
1701       s->clear();
1702     }
1703   }
1704 
1705   if (s->getLength() > 0) {
1706     (*outputFunc)(outputStream, s->getCString(), s->getLength());
1707   }
1708   delete s;
1709 }
1710 
encodeFragment(Unicode * text,int len,UnicodeMap * uMap,GBool primaryLR,GString * s)1711 void TextPage::encodeFragment(Unicode *text, int len, UnicodeMap *uMap,
1712 			      GBool primaryLR, GString *s) {
1713   char lre[8], rle[8], popdf[8], buf[8];
1714   int lreLen, rleLen, popdfLen, n;
1715   int i, j, k;
1716 
1717   if (uMap->isUnicode()) {
1718 
1719     lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
1720     rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
1721     popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
1722 
1723     if (primaryLR) {
1724 
1725       i = 0;
1726       while (i < len) {
1727 	// output a left-to-right section
1728 	for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
1729 	for (k = i; k < j; ++k) {
1730 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1731 	  s->append(buf, n);
1732 	}
1733 	i = j;
1734 	// output a right-to-left section
1735 	for (j = i;
1736 	     j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
1737 	     ++j) ;
1738 	if (j > i) {
1739 	  s->append(rle, rleLen);
1740 	  for (k = j - 1; k >= i; --k) {
1741 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1742 	    s->append(buf, n);
1743 	  }
1744 	  s->append(popdf, popdfLen);
1745 	  i = j;
1746 	}
1747       }
1748 
1749     } else {
1750 
1751       // Note: This code treats numeric characters (European and
1752       // Arabic/Indic) as left-to-right, which isn't strictly correct
1753       // (incurs extra LRE/POPDF pairs), but does produce correct
1754       // visual formatting.
1755       s->append(rle, rleLen);
1756       i = len - 1;
1757       while (i >= 0) {
1758 	// output a right-to-left section
1759 	for (j = i;
1760 	     j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
1761 	     --j) ;
1762 	for (k = i; k > j; --k) {
1763 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1764 	  s->append(buf, n);
1765 	}
1766 	i = j;
1767 	// output a left-to-right section
1768 	for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
1769 	if (j < i) {
1770 	  s->append(lre, lreLen);
1771 	  for (k = j + 1; k <= i; ++k) {
1772 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1773 	    s->append(buf, n);
1774 	  }
1775 	  s->append(popdf, popdfLen);
1776 	  i = j;
1777 	}
1778       }
1779       s->append(popdf, popdfLen);
1780     }
1781 
1782   } else {
1783     for (i = 0; i < len; ++i) {
1784       n = uMap->mapUnicode(text[i], buf, sizeof(buf));
1785       s->append(buf, n);
1786     }
1787   }
1788 }
1789 
1790 //------------------------------------------------------------------------
1791 // TextPage: layout analysis
1792 //------------------------------------------------------------------------
1793 
1794 // Determine primary (most common) rotation value.  Rotate all chars
1795 // to that primary rotation.
rotateChars(GList * charsA)1796 int TextPage::rotateChars(GList *charsA) {
1797   TextChar *ch;
1798   int nChars[4];
1799   double xMin, yMin, xMax, yMax, t;
1800   int rot, i;
1801 
1802   // determine primary rotation
1803   nChars[0] = nChars[1] = nChars[2] = nChars[3] = 0;
1804   for (i = 0; i < charsA->getLength(); ++i) {
1805     ch = (TextChar *)charsA->get(i);
1806     ++nChars[ch->rot];
1807   }
1808   rot = 0;
1809   for (i = 1; i < 4; ++i) {
1810     if (nChars[i] > nChars[rot]) {
1811       rot = i;
1812     }
1813   }
1814 
1815   // rotate
1816   switch (rot) {
1817   case 0:
1818   default:
1819     break;
1820   case 1:
1821     for (i = 0; i < charsA->getLength(); ++i) {
1822       ch = (TextChar *)charsA->get(i);
1823       xMin = ch->yMin;
1824       xMax = ch->yMax;
1825       yMin = pageWidth - ch->xMax;
1826       yMax = pageWidth - ch->xMin;
1827       ch->xMin = xMin;
1828       ch->xMax = xMax;
1829       ch->yMin = yMin;
1830       ch->yMax = yMax;
1831       ch->rot = (ch->rot + 3) & 3;
1832     }
1833     t = pageWidth;
1834     pageWidth = pageHeight;
1835     pageHeight = t;
1836     break;
1837   case 2:
1838     for (i = 0; i < charsA->getLength(); ++i) {
1839       ch = (TextChar *)charsA->get(i);
1840       xMin = pageWidth - ch->xMax;
1841       xMax = pageWidth - ch->xMin;
1842       yMin = pageHeight - ch->yMax;
1843       yMax = pageHeight - ch->yMin;
1844       ch->xMin = xMin;
1845       ch->xMax = xMax;
1846       ch->yMin = yMin;
1847       ch->yMax = yMax;
1848       ch->rot = (ch->rot + 2) & 3;
1849     }
1850     break;
1851   case 3:
1852     for (i = 0; i < charsA->getLength(); ++i) {
1853       ch = (TextChar *)charsA->get(i);
1854       xMin = pageHeight - ch->yMax;
1855       xMax = pageHeight - ch->yMin;
1856       yMin = ch->xMin;
1857       yMax = ch->xMax;
1858       ch->xMin = xMin;
1859       ch->xMax = xMax;
1860       ch->yMin = yMin;
1861       ch->yMax = yMax;
1862       ch->rot = (ch->rot + 1) & 3;
1863     }
1864     t = pageWidth;
1865     pageWidth = pageHeight;
1866     pageHeight = t;
1867     break;
1868   }
1869 
1870   return rot;
1871 }
1872 
1873 // Rotate the TextUnderlines and TextLinks to match the transform
1874 // performed by rotateChars().
rotateUnderlinesAndLinks(int rot)1875 void TextPage::rotateUnderlinesAndLinks(int rot) {
1876   TextUnderline *underline;
1877   TextLink *link;
1878   double xMin, yMin, xMax, yMax;
1879   int i;
1880 
1881   switch (rot) {
1882   case 0:
1883   default:
1884     break;
1885   case 1:
1886     for (i = 0; i < underlines->getLength(); ++i) {
1887       underline = (TextUnderline *)underlines->get(i);
1888       xMin = underline->y0;
1889       xMax = underline->y1;
1890       yMin = pageWidth - underline->x1;
1891       yMax = pageWidth - underline->x0;
1892       underline->x0 = xMin;
1893       underline->x1 = xMax;
1894       underline->y0 = yMin;
1895       underline->y1 = yMax;
1896       underline->horiz = !underline->horiz;
1897     }
1898     for (i = 0; i < links->getLength(); ++i) {
1899       link = (TextLink *)links->get(i);
1900       xMin = link->yMin;
1901       xMax = link->yMax;
1902       yMin = pageWidth - link->xMax;
1903       yMax = pageWidth - link->xMin;
1904       link->xMin = xMin;
1905       link->xMax = xMax;
1906       link->yMin = yMin;
1907       link->yMax = yMax;
1908     }
1909     break;
1910   case 2:
1911     for (i = 0; i < underlines->getLength(); ++i) {
1912       underline = (TextUnderline *)underlines->get(i);
1913       xMin = pageWidth - underline->x1;
1914       xMax = pageWidth - underline->x0;
1915       yMin = pageHeight - underline->y1;
1916       yMax = pageHeight - underline->y0;
1917       underline->x0 = xMin;
1918       underline->x1 = xMax;
1919       underline->y0 = yMin;
1920       underline->y1 = yMax;
1921     }
1922     for (i = 0; i < links->getLength(); ++i) {
1923       link = (TextLink *)links->get(i);
1924       xMin = pageWidth - link->xMax;
1925       xMax = pageWidth - link->xMin;
1926       yMin = pageHeight - link->yMax;
1927       yMax = pageHeight - link->yMin;
1928       link->xMin = xMin;
1929       link->xMax = xMax;
1930       link->yMin = yMin;
1931       link->yMax = yMax;
1932     }
1933     break;
1934   case 3:
1935     for (i = 0; i < underlines->getLength(); ++i) {
1936       underline = (TextUnderline *)underlines->get(i);
1937       xMin = pageHeight - underline->y1;
1938       xMax = pageHeight - underline->y0;
1939       yMin = underline->x0;
1940       yMax = underline->x1;
1941       underline->x0 = xMin;
1942       underline->x1 = xMax;
1943       underline->y0 = yMin;
1944       underline->y1 = yMax;
1945       underline->horiz = !underline->horiz;
1946     }
1947     for (i = 0; i < links->getLength(); ++i) {
1948       link = (TextLink *)links->get(i);
1949       xMin = pageHeight - link->yMax;
1950       xMax = pageHeight - link->yMin;
1951       yMin = link->xMin;
1952       yMax = link->xMax;
1953       link->xMin = xMin;
1954       link->xMax = xMax;
1955       link->yMin = yMin;
1956       link->yMax = yMax;
1957     }
1958     break;
1959   }
1960 }
1961 
1962 // Undo the coordinate transform performed by rotateChars().
unrotateChars(GList * charsA,int rot)1963 void TextPage::unrotateChars(GList *charsA, int rot) {
1964   TextChar *ch;
1965   double xMin, yMin, xMax, yMax, t;
1966   int i;
1967 
1968   switch (rot) {
1969   case 0:
1970   default:
1971     // no transform
1972     break;
1973   case 1:
1974     t = pageWidth;
1975     pageWidth = pageHeight;
1976     pageHeight = t;
1977     for (i = 0; i < charsA->getLength(); ++i) {
1978       ch = (TextChar *)charsA->get(i);
1979       xMin = pageWidth - ch->yMax;
1980       xMax = pageWidth - ch->yMin;
1981       yMin = ch->xMin;
1982       yMax = ch->xMax;
1983       ch->xMin = xMin;
1984       ch->xMax = xMax;
1985       ch->yMin = yMin;
1986       ch->yMax = yMax;
1987       ch->rot = (ch->rot + 1) & 3;
1988     }
1989     break;
1990   case 2:
1991     for (i = 0; i < charsA->getLength(); ++i) {
1992       ch = (TextChar *)charsA->get(i);
1993       xMin = pageWidth - ch->xMax;
1994       xMax = pageWidth - ch->xMin;
1995       yMin = pageHeight - ch->yMax;
1996       yMax = pageHeight - ch->yMin;
1997       ch->xMin = xMin;
1998       ch->xMax = xMax;
1999       ch->yMin = yMin;
2000       ch->yMax = yMax;
2001       ch->rot = (ch->rot + 2) & 3;
2002     }
2003     break;
2004   case 3:
2005     t = pageWidth;
2006     pageWidth = pageHeight;
2007     pageHeight = t;
2008     for (i = 0; i < charsA->getLength(); ++i) {
2009       ch = (TextChar *)charsA->get(i);
2010       xMin = ch->yMin;
2011       xMax = ch->yMax;
2012       yMin = pageHeight - ch->xMax;
2013       yMax = pageHeight - ch->xMin;
2014       ch->xMin = xMin;
2015       ch->xMax = xMax;
2016       ch->yMin = yMin;
2017       ch->yMax = yMax;
2018       ch->rot = (ch->rot + 3) & 3;
2019     }
2020     break;
2021   }
2022 }
2023 
2024 // Undo the coordinate transform performed by rotateChars().
unrotateColumns(GList * columns,int rot)2025 void TextPage::unrotateColumns(GList *columns, int rot) {
2026   TextColumn *col;
2027   TextParagraph *par;
2028   TextLine *line;
2029   TextWord *word;
2030   double xMin, yMin, xMax, yMax, t;
2031   int colIdx, parIdx, lineIdx, wordIdx, i;
2032 
2033   switch (rot) {
2034   case 0:
2035   default:
2036     // no transform
2037     break;
2038   case 1:
2039     t = pageWidth;
2040     pageWidth = pageHeight;
2041     pageHeight = t;
2042     for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2043       col = (TextColumn *)columns->get(colIdx);
2044       xMin = pageWidth - col->yMax;
2045       xMax = pageWidth - col->yMin;
2046       yMin = col->xMin;
2047       yMax = col->xMax;
2048       col->xMin = xMin;
2049       col->xMax = xMax;
2050       col->yMin = yMin;
2051       col->yMax = yMax;
2052       for (parIdx = 0;
2053 	   parIdx < col->paragraphs->getLength();
2054 	   ++parIdx) {
2055 	par = (TextParagraph *)col->paragraphs->get(parIdx);
2056 	xMin = pageWidth - par->yMax;
2057 	xMax = pageWidth - par->yMin;
2058 	yMin = par->xMin;
2059 	yMax = par->xMax;
2060 	par->xMin = xMin;
2061 	par->xMax = xMax;
2062 	par->yMin = yMin;
2063 	par->yMax = yMax;
2064 	for (lineIdx = 0;
2065 	     lineIdx < par->lines->getLength();
2066 	     ++lineIdx) {
2067 	  line = (TextLine *)par->lines->get(lineIdx);
2068 	  xMin = pageWidth - line->yMax;
2069 	  xMax = pageWidth - line->yMin;
2070 	  yMin = line->xMin;
2071 	  yMax = line->xMax;
2072 	  line->xMin = xMin;
2073 	  line->xMax = xMax;
2074 	  line->yMin = yMin;
2075 	  line->yMax = yMax;
2076 	  line->rot = (line->rot + 1) & 3;
2077 	  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2078 	    word = (TextWord *)line->words->get(wordIdx);
2079 	    xMin = pageWidth - word->yMax;
2080 	    xMax = pageWidth - word->yMin;
2081 	    yMin = word->xMin;
2082 	    yMax = word->xMax;
2083 	    word->xMin = xMin;
2084 	    word->xMax = xMax;
2085 	    word->yMin = yMin;
2086 	    word->yMax = yMax;
2087 	    word->rot = (word->rot + 1) & 3;
2088 	  }
2089 	}
2090       }
2091     }
2092     break;
2093   case 2:
2094     for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2095       col = (TextColumn *)columns->get(colIdx);
2096       xMin = pageWidth - col->xMax;
2097       xMax = pageWidth - col->xMin;
2098       yMin = pageHeight - col->yMax;
2099       yMax = pageHeight - col->yMin;
2100       col->xMin = xMin;
2101       col->xMax = xMax;
2102       col->yMin = yMin;
2103       col->yMax = yMax;
2104       for (parIdx = 0;
2105 	   parIdx < col->paragraphs->getLength();
2106 	   ++parIdx) {
2107 	par = (TextParagraph *)col->paragraphs->get(parIdx);
2108 	xMin = pageWidth - par->xMax;
2109 	xMax = pageWidth - par->xMin;
2110 	yMin = pageHeight - par->yMax;
2111 	yMax = pageHeight - par->yMin;
2112 	par->xMin = xMin;
2113 	par->xMax = xMax;
2114 	par->yMin = yMin;
2115 	par->yMax = yMax;
2116 	for (lineIdx = 0;
2117 	     lineIdx < par->lines->getLength();
2118 	     ++lineIdx) {
2119 	  line = (TextLine *)par->lines->get(lineIdx);
2120 	  xMin = pageWidth - line->xMax;
2121 	  xMax = pageWidth - line->xMin;
2122 	  yMin = pageHeight - line->yMax;
2123 	  yMax = pageHeight - line->yMin;
2124 	  line->xMin = xMin;
2125 	  line->xMax = xMax;
2126 	  line->yMin = yMin;
2127 	  line->yMax = yMax;
2128 	  line->rot = (line->rot + 2) & 3;
2129 	  for (i = 0; i <= line->len; ++i) {
2130 	    line->edge[i] = pageWidth - line->edge[i];
2131 	  }
2132 	  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2133 	    word = (TextWord *)line->words->get(wordIdx);
2134 	    xMin = pageWidth - word->xMax;
2135 	    xMax = pageWidth - word->xMin;
2136 	    yMin = pageHeight - word->yMax;
2137 	    yMax = pageHeight - word->yMin;
2138 	    word->xMin = xMin;
2139 	    word->xMax = xMax;
2140 	    word->yMin = yMin;
2141 	    word->yMax = yMax;
2142 	    word->rot = (word->rot + 2) & 3;
2143 	    for (i = 0; i <= word->len; ++i) {
2144 	      word->edge[i] = pageWidth - word->edge[i];
2145 	    }
2146 	  }
2147 	}
2148       }
2149     }
2150     break;
2151   case 3:
2152     t = pageWidth;
2153     pageWidth = pageHeight;
2154     pageHeight = t;
2155     for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2156       col = (TextColumn *)columns->get(colIdx);
2157       xMin = col->yMin;
2158       xMax = col->yMax;
2159       yMin = pageHeight - col->xMax;
2160       yMax = pageHeight - col->xMin;
2161       col->xMin = xMin;
2162       col->xMax = xMax;
2163       col->yMin = yMin;
2164       col->yMax = yMax;
2165       for (parIdx = 0;
2166 	   parIdx < col->paragraphs->getLength();
2167 	   ++parIdx) {
2168 	par = (TextParagraph *)col->paragraphs->get(parIdx);
2169 	xMin = par->yMin;
2170 	xMax = par->yMax;
2171 	yMin = pageHeight - par->xMax;
2172 	yMax = pageHeight - par->xMin;
2173 	par->xMin = xMin;
2174 	par->xMax = xMax;
2175 	par->yMin = yMin;
2176 	par->yMax = yMax;
2177 	for (lineIdx = 0;
2178 	     lineIdx < par->lines->getLength();
2179 	     ++lineIdx) {
2180 	  line = (TextLine *)par->lines->get(lineIdx);
2181 	  xMin = line->yMin;
2182 	  xMax = line->yMax;
2183 	  yMin = pageHeight - line->xMax;
2184 	  yMax = pageHeight - line->xMin;
2185 	  line->xMin = xMin;
2186 	  line->xMax = xMax;
2187 	  line->yMin = yMin;
2188 	  line->yMax = yMax;
2189 	  line->rot = (line->rot + 3) & 3;
2190 	  for (i = 0; i <= line->len; ++i) {
2191 	    line->edge[i] = pageHeight - line->edge[i];
2192 	  }
2193 	  for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2194 	    word = (TextWord *)line->words->get(wordIdx);
2195 	    xMin = word->yMin;
2196 	    xMax = word->yMax;
2197 	    yMin = pageHeight - word->xMax;
2198 	    yMax = pageHeight - word->xMin;
2199 	    word->xMin = xMin;
2200 	    word->xMax = xMax;
2201 	    word->yMin = yMin;
2202 	    word->yMax = yMax;
2203 	    word->rot = (word->rot + 3) & 3;
2204 	    for (i = 0; i <= word->len; ++i) {
2205 	      word->edge[i] = pageHeight - word->edge[i];
2206 	    }
2207 	  }
2208 	}
2209       }
2210     }
2211     break;
2212   }
2213 }
2214 
unrotateWords(GList * words,int rot)2215 void TextPage::unrotateWords(GList *words, int rot) {
2216   TextWord *word;
2217   double xMin, yMin, xMax, yMax;
2218   int i, j;
2219 
2220   switch (rot) {
2221   case 0:
2222   default:
2223     // no transform
2224     break;
2225   case 1:
2226     for (i = 0; i < words->getLength(); ++i) {
2227       word = (TextWord *)words->get(i);
2228       xMin = pageWidth - word->yMax;
2229       xMax = pageWidth - word->yMin;
2230       yMin = word->xMin;
2231       yMax = word->xMax;
2232       word->xMin = xMin;
2233       word->xMax = xMax;
2234       word->yMin = yMin;
2235       word->yMax = yMax;
2236       word->rot = (word->rot + 1) & 3;
2237     }
2238     break;
2239   case 2:
2240     for (i = 0; i < words->getLength(); ++i) {
2241       word = (TextWord *)words->get(i);
2242       xMin = pageWidth - word->xMax;
2243       xMax = pageWidth - word->xMin;
2244       yMin = pageHeight - word->yMax;
2245       yMax = pageHeight - word->yMin;
2246       word->xMin = xMin;
2247       word->xMax = xMax;
2248       word->yMin = yMin;
2249       word->yMax = yMax;
2250       word->rot = (word->rot + 2) & 3;
2251       for (j = 0; j <= word->len; ++j) {
2252 	word->edge[j] = pageWidth - word->edge[j];
2253       }
2254     }
2255     break;
2256   case 3:
2257     for (i = 0; i < words->getLength(); ++i) {
2258       word = (TextWord *)words->get(i);
2259       xMin = word->yMin;
2260       xMax = word->yMax;
2261       yMin = pageHeight - word->xMax;
2262       yMax = pageHeight - word->xMin;
2263       word->xMin = xMin;
2264       word->xMax = xMax;
2265       word->yMin = yMin;
2266       word->yMax = yMax;
2267       word->rot = (word->rot + 3) & 3;
2268       for (j = 0; j <= word->len; ++j) {
2269 	word->edge[j] = pageHeight - word->edge[j];
2270       }
2271     }
2272     break;
2273   }
2274 }
2275 
2276 // Determine the primary text direction (LR vs RL).  Returns true for
2277 // LR, false for RL.
checkPrimaryLR(GList * charsA)2278 GBool TextPage::checkPrimaryLR(GList *charsA) {
2279   TextChar *ch;
2280   int i, lrCount;
2281 
2282   lrCount = 0;
2283   for (i = 0; i < charsA->getLength(); ++i) {
2284     ch = (TextChar *)charsA->get(i);
2285     if (unicodeTypeL(ch->c)) {
2286       ++lrCount;
2287     } else if (unicodeTypeR(ch->c)) {
2288       --lrCount;
2289     }
2290   }
2291   return lrCount >= 0;
2292 }
2293 
2294 // Remove duplicate characters.  The list of chars has been sorted --
2295 // by x for rot=0,2; by y for rot=1,3.
removeDuplicates(GList * charsA,int rot)2296 void TextPage::removeDuplicates(GList *charsA, int rot) {
2297   TextChar *ch, *ch2;
2298   double xDelta, yDelta;
2299   int i, j;
2300 
2301   if (rot & 1) {
2302     for (i = 0; i < charsA->getLength(); ++i) {
2303       ch = (TextChar *)charsA->get(i);
2304       xDelta = dupMaxSecDelta * ch->fontSize;
2305       yDelta = dupMaxPriDelta * ch->fontSize;
2306       j = i + 1;
2307       while (j < charsA->getLength()) {
2308 	ch2 = (TextChar *)charsA->get(j);
2309 	if (ch2->yMin - ch->yMin >= yDelta) {
2310 	  break;
2311 	}
2312 	if (ch2->c == ch->c &&
2313 	    fabs(ch2->xMin - ch->xMin) < xDelta &&
2314 	    fabs(ch2->xMax - ch->xMax) < xDelta &&
2315 	    fabs(ch2->yMax - ch->yMax) < yDelta) {
2316 	  charsA->del(j);
2317 	} else {
2318 	  ++j;
2319 	}
2320       }
2321     }
2322   } else {
2323     for (i = 0; i < charsA->getLength(); ++i) {
2324       ch = (TextChar *)charsA->get(i);
2325       xDelta = dupMaxPriDelta * ch->fontSize;
2326       yDelta = dupMaxSecDelta * ch->fontSize;
2327       j = i + 1;
2328       while (j < charsA->getLength()) {
2329 	ch2 = (TextChar *)charsA->get(j);
2330 	if (ch2->xMin - ch->xMin >= xDelta) {
2331 	  break;
2332 	}
2333 	if (ch2->c == ch->c &&
2334 	    fabs(ch2->xMax - ch->xMax) < xDelta &&
2335 	    fabs(ch2->yMin - ch->yMin) < yDelta &&
2336 	    fabs(ch2->yMax - ch->yMax) < yDelta) {
2337 	  charsA->del(j);
2338 	} else {
2339 	  ++j;
2340 	}
2341       }
2342     }
2343   }
2344 }
2345 
2346 // Split the characters into trees of TextBlocks, one tree for each
2347 // rotation.  Merge into a single tree (with the primary rotation).
splitChars(GList * charsA)2348 TextBlock *TextPage::splitChars(GList *charsA) {
2349   TextBlock *tree[4];
2350   TextBlock *blk;
2351   GList *chars2, *clippedChars;
2352   TextChar *ch;
2353   int rot, i;
2354 
2355   // split: build a tree of TextBlocks for each rotation
2356   clippedChars = new GList();
2357   for (rot = 0; rot < 4; ++rot) {
2358     chars2 = new GList();
2359     for (i = 0; i < charsA->getLength(); ++i) {
2360       ch = (TextChar *)charsA->get(i);
2361       if (ch->rot == rot) {
2362 	chars2->append(ch);
2363       }
2364     }
2365     tree[rot] = NULL;
2366     if (chars2->getLength() > 0) {
2367       chars2->sort((rot & 1) ? &TextChar::cmpY : &TextChar::cmpX);
2368       removeDuplicates(chars2, rot);
2369       if (control.clipText) {
2370 	i = 0;
2371 	while (i < chars2->getLength()) {
2372 	  ch = (TextChar *)chars2->get(i);
2373 	  if (ch->clipped) {
2374 	    ch = (TextChar *)chars2->del(i);
2375 	    clippedChars->append(ch);
2376 	  } else {
2377 	    ++i;
2378 	  }
2379 	}
2380       }
2381       if (chars2->getLength() > 0) {
2382 	tree[rot] = split(chars2, rot);
2383       }
2384     }
2385     delete chars2;
2386   }
2387 
2388   // if the page contains no (unclipped) text, just leave an empty
2389   // column list
2390   if (!tree[0]) {
2391     delete clippedChars;
2392     return NULL;
2393   }
2394 
2395   // if the main tree is not a multicolumn node, insert one so that
2396   // rotated text has somewhere to go
2397   if (tree[0]->tag != blkTagMulticolumn) {
2398     blk = new TextBlock(blkHorizSplit, 0);
2399     blk->addChild(tree[0]);
2400     blk->tag = blkTagMulticolumn;
2401     tree[0] = blk;
2402   }
2403 
2404   // merge non-primary-rotation text into the primary-rotation tree
2405   for (rot = 1; rot < 4; ++rot) {
2406     if (tree[rot]) {
2407       insertIntoTree(tree[rot], tree[0]);
2408       tree[rot] = NULL;
2409     }
2410   }
2411 
2412   if (clippedChars->getLength()) {
2413     insertClippedChars(clippedChars, tree[0]);
2414   }
2415   delete clippedChars;
2416 
2417 #if 0 //~debug
2418   dumpTree(tree[0]);
2419 #endif
2420 
2421   return tree[0];
2422 }
2423 
2424 // Generate a tree of TextBlocks, marked as columns, lines, and words.
split(GList * charsA,int rot)2425 TextBlock *TextPage::split(GList *charsA, int rot) {
2426   TextBlock *blk;
2427   GList *chars2, *chars3;
2428   int *horizProfile, *vertProfile;
2429   double xMin, yMin, xMax, yMax;
2430   int xMinI, yMinI, xMaxI, yMaxI;
2431   int xMinI2, yMinI2, xMaxI2, yMaxI2;
2432   TextChar *ch;
2433   double minFontSize, avgFontSize, splitPrecision;
2434   double nLines, vertGapThreshold, ascentAdjust, descentAdjust, minChunk;
2435   int horizGapSize, vertGapSize;
2436   double horizGapSize2, vertGapSize2;
2437   int minHorizChunkWidth, minVertChunkWidth, nHorizGaps, nVertGaps;
2438   double largeCharSize;
2439   int nLargeChars;
2440   GBool doHorizSplit, doVertSplit, smallSplit;
2441   int i, x, y, prev, start;
2442 
2443   //----- compute bbox, min font size, average font size, and
2444   //      split precision for this block
2445 
2446   xMin = yMin = xMax = yMax = 0; // make gcc happy
2447   minFontSize = avgFontSize = 0;
2448   for (i = 0; i < charsA->getLength(); ++i) {
2449     ch = (TextChar *)charsA->get(i);
2450     if (i == 0 || ch->xMin < xMin) {
2451       xMin = ch->xMin;
2452     }
2453     if (i == 0 || ch->yMin < yMin) {
2454       yMin = ch->yMin;
2455     }
2456     if (i == 0 || ch->xMax > xMax) {
2457       xMax = ch->xMax;
2458     }
2459     if (i == 0 || ch->yMax > yMax) {
2460       yMax = ch->yMax;
2461     }
2462     avgFontSize += ch->fontSize;
2463     if (i == 0 || ch->fontSize < minFontSize) {
2464       minFontSize = ch->fontSize;
2465     }
2466   }
2467   avgFontSize /= charsA->getLength();
2468   splitPrecision = splitPrecisionMul * minFontSize;
2469   if (splitPrecision < minSplitPrecision) {
2470     splitPrecision = minSplitPrecision;
2471   }
2472 
2473   //----- compute the horizontal and vertical profiles
2474 
2475   if (xMin / splitPrecision < 0.5 * INT_MIN ||
2476       xMax / splitPrecision > 0.5 * INT_MAX ||
2477       yMin / splitPrecision < 0.5 * INT_MIN ||
2478       xMax / splitPrecision > 0.5 * INT_MAX) {
2479     return NULL;
2480   }
2481   // add some slack to the array bounds to avoid floating point
2482   // precision problems
2483   xMinI = (int)floor(xMin / splitPrecision) - 1;
2484   yMinI = (int)floor(yMin / splitPrecision) - 1;
2485   xMaxI = (int)floor(xMax / splitPrecision) + 1;
2486   yMaxI = (int)floor(yMax / splitPrecision) + 1;
2487   horizProfile = (int *)gmallocn(yMaxI - yMinI + 1, sizeof(int));
2488   vertProfile = (int *)gmallocn(xMaxI - xMinI + 1, sizeof(int));
2489   memset(horizProfile, 0, (yMaxI - yMinI + 1) * sizeof(int));
2490   memset(vertProfile, 0, (xMaxI - xMinI + 1) * sizeof(int));
2491   for (i = 0; i < charsA->getLength(); ++i) {
2492     ch = (TextChar *)charsA->get(i);
2493     // yMinI2 and yMaxI2 are adjusted to allow for slightly overlapping lines
2494     switch (rot) {
2495     case 0:
2496     default:
2497       xMinI2 = (int)floor(ch->xMin / splitPrecision);
2498       xMaxI2 = (int)floor(ch->xMax / splitPrecision);
2499       ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
2500       yMinI2 = (int)floor((ch->yMin + ascentAdjust) / splitPrecision);
2501       descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
2502       yMaxI2 = (int)floor((ch->yMax - descentAdjust) / splitPrecision);
2503       break;
2504     case 1:
2505       descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
2506       xMinI2 = (int)floor((ch->xMin + descentAdjust) / splitPrecision);
2507       ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
2508       xMaxI2 = (int)floor((ch->xMax - ascentAdjust) / splitPrecision);
2509       yMinI2 = (int)floor(ch->yMin / splitPrecision);
2510       yMaxI2 = (int)floor(ch->yMax / splitPrecision);
2511       break;
2512     case 2:
2513       xMinI2 = (int)floor(ch->xMin / splitPrecision);
2514       xMaxI2 = (int)floor(ch->xMax / splitPrecision);
2515       descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
2516       yMinI2 = (int)floor((ch->yMin + descentAdjust) / splitPrecision);
2517       ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
2518       yMaxI2 = (int)floor((ch->yMax - ascentAdjust) / splitPrecision);
2519       break;
2520     case 3:
2521       ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
2522       xMinI2 = (int)floor((ch->xMin + ascentAdjust) / splitPrecision);
2523       descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
2524       xMaxI2 = (int)floor((ch->xMax - descentAdjust) / splitPrecision);
2525       yMinI2 = (int)floor(ch->yMin / splitPrecision);
2526       yMaxI2 = (int)floor(ch->yMax / splitPrecision);
2527       break;
2528     }
2529     for (y = yMinI2; y <= yMaxI2; ++y) {
2530       ++horizProfile[y - yMinI];
2531     }
2532     for (x = xMinI2; x <= xMaxI2; ++x) {
2533       ++vertProfile[x - xMinI];
2534     }
2535   }
2536 
2537   //----- find the largest gaps in the horizontal and vertical profiles
2538 
2539   horizGapSize = 0;
2540   for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
2541   for (y = start; y < yMaxI; ++y) {
2542     if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2543       start = y;
2544     } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2545       if (y - start > horizGapSize) {
2546 	horizGapSize = y - start;
2547       }
2548     }
2549   }
2550   vertGapSize = 0;
2551   for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2552   for (x = start; x < xMaxI; ++x) {
2553     if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2554       start = x;
2555     } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2556       if (x - start > vertGapSize) {
2557 	vertGapSize = x - start;
2558       }
2559     }
2560   }
2561   horizGapSize2 = horizGapSize - splitGapSlack * avgFontSize / splitPrecision;
2562   if (horizGapSize2 < 0.99) {
2563     horizGapSize2 = 0.99;
2564   }
2565   vertGapSize2 = vertGapSize - splitGapSlack * avgFontSize / splitPrecision;
2566   if (vertGapSize2 < 0.99) {
2567     vertGapSize2 = 0.99;
2568   }
2569 
2570   //----- count horiz/vert gaps equivalent to largest gaps
2571 
2572   minHorizChunkWidth = yMaxI - yMinI;
2573   nHorizGaps = 0;
2574   for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
2575   prev = start - 1;
2576   for (y = start; y < yMaxI; ++y) {
2577     if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2578       start = y;
2579     } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2580       if (y - start > horizGapSize2) {
2581 	++nHorizGaps;
2582 	if (start - prev < minHorizChunkWidth) {
2583 	  minHorizChunkWidth = start - prev;
2584 	}
2585 	prev = y;
2586       }
2587     }
2588   }
2589   minVertChunkWidth = xMaxI - xMinI;
2590   nVertGaps = 0;
2591   for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2592   prev = start - 1;
2593   for (x = start; x < xMaxI; ++x) {
2594     if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2595       start = x;
2596     } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2597       if (x - start > vertGapSize2) {
2598 	++nVertGaps;
2599 	if (start - prev < minVertChunkWidth) {
2600 	  minVertChunkWidth = start - prev;
2601 	}
2602 	prev = x;
2603       }
2604     }
2605   }
2606 
2607   //----- compute splitting parameters
2608 
2609   // approximation of number of lines in block
2610   if (fabs(avgFontSize) < 0.001) {
2611     nLines = 1;
2612   } else if (rot & 1) {
2613     nLines = (xMax - xMin) / avgFontSize;
2614   } else {
2615     nLines = (yMax - yMin) / avgFontSize;
2616   }
2617 
2618   // compute the minimum allowed vertical gap size
2619   // (this is a horizontal gap threshold for rot=1,3
2620   if (control.mode == textOutTableLayout) {
2621     vertGapThreshold = vertGapThresholdTableMax
2622                        + vertGapThresholdTableSlope * nLines;
2623     if (vertGapThreshold < vertGapThresholdTableMin) {
2624       vertGapThreshold = vertGapThresholdTableMin;
2625     }
2626   } else {
2627     vertGapThreshold = vertGapThresholdMax + vertGapThresholdSlope * nLines;
2628     if (vertGapThreshold < vertGapThresholdMin) {
2629       vertGapThreshold = vertGapThresholdMin;
2630     }
2631   }
2632   vertGapThreshold = vertGapThreshold * avgFontSize / splitPrecision;
2633 
2634   // compute the minimum allowed chunk width
2635   if (control.mode == textOutTableLayout) {
2636     minChunk = 0;
2637   } else {
2638     minChunk = vertSplitChunkThreshold * avgFontSize / splitPrecision;
2639   }
2640 
2641   // look for large chars
2642   // -- this kludge (multiply by 256, convert to int, divide by 256.0)
2643   //    prevents floating point stability issues on x86 with gcc, where
2644   //    largeCharSize could otherwise have slightly different values
2645   //    here and where it's used below to do the large char partition
2646   //    (because it gets truncated from 80 to 64 bits when spilled)
2647   largeCharSize = (int)(largeCharThreshold * avgFontSize * 256) / 256.0;
2648   nLargeChars = 0;
2649   for (i = 0; i < charsA->getLength(); ++i) {
2650     ch = (TextChar *)charsA->get(i);
2651     if (ch->fontSize > largeCharSize) {
2652       ++nLargeChars;
2653     }
2654   }
2655 
2656   // figure out which type of split to do
2657   doHorizSplit = doVertSplit = gFalse;
2658   smallSplit = gFalse;
2659   if (rot & 1) {
2660     if (nHorizGaps > 0 &&
2661 	(horizGapSize > vertGapSize || control.mode == textOutTableLayout) &&
2662 	horizGapSize > vertGapThreshold &&
2663 	minHorizChunkWidth > minChunk) {
2664       doHorizSplit = gTrue;
2665     } else if (nVertGaps > 0) {
2666       doVertSplit = gTrue;
2667     } else if (nLargeChars == 0 && nHorizGaps > 0) {
2668       doHorizSplit = gTrue;
2669       smallSplit = gTrue;
2670     }
2671   } else {
2672     if (nVertGaps > 0 &&
2673 	(vertGapSize > horizGapSize || control.mode == textOutTableLayout) &&
2674 	vertGapSize > vertGapThreshold &&
2675 	minVertChunkWidth > minChunk) {
2676       doVertSplit = gTrue;
2677     } else if (nHorizGaps > 0) {
2678       doHorizSplit = gTrue;
2679     } else if (nLargeChars == 0 && nVertGaps > 0) {
2680       doVertSplit = gTrue;
2681       smallSplit = gTrue;
2682     }
2683   }
2684 
2685   //----- split the block
2686 
2687   //~ this could use "other content" (vector graphics, rotated text) --
2688   //~ presence of other content in a gap means we should definitely split
2689 
2690   // split vertically
2691   if (doVertSplit) {
2692     blk = new TextBlock(blkVertSplit, rot);
2693     blk->smallSplit = smallSplit;
2694     for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2695     prev = start - 1;
2696     for (x = start; x < xMaxI; ++x) {
2697       if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2698 	start = x;
2699       } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2700 	if (x - start > vertGapSize2) {
2701 	  chars2 = getChars(charsA, (prev + 0.5) * splitPrecision, yMin - 1,
2702 			    (start + 1.5) * splitPrecision, yMax + 1);
2703 	  blk->addChild(split(chars2, rot));
2704 	  delete chars2;
2705 	  prev = x;
2706 	}
2707       }
2708     }
2709     chars2 = getChars(charsA, (prev + 0.5) * splitPrecision, yMin - 1,
2710 		      xMax + 1, yMax + 1);
2711     blk->addChild(split(chars2, rot));
2712     delete chars2;
2713 
2714   // split horizontally
2715   } else if (doHorizSplit) {
2716     blk = new TextBlock(blkHorizSplit, rot);
2717     blk->smallSplit = smallSplit;
2718     for (start = yMinI;
2719 	 start < yMaxI && !horizProfile[start - yMinI];
2720 	 ++start) ;
2721     prev = start - 1;
2722     for (y = start; y < yMaxI; ++y) {
2723       if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2724 	start = y;
2725       } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2726 	if (y - start > horizGapSize2) {
2727 	  chars2 = getChars(charsA, xMin - 1, (prev + 0.5) * splitPrecision,
2728 			    xMax + 1, (start + 1.5) * splitPrecision);
2729 	  blk->addChild(split(chars2, rot));
2730 	  delete chars2;
2731 	  prev = y;
2732 	}
2733       }
2734     }
2735     chars2 = getChars(charsA, xMin - 1, (prev + 0.5) * splitPrecision,
2736 		      xMax + 1, yMax + 1);
2737     blk->addChild(split(chars2, rot));
2738     delete chars2;
2739 
2740   // split into larger and smaller chars
2741   } else if (nLargeChars > 0) {
2742     chars2 = new GList();
2743     chars3 = new GList();
2744     for (i = 0; i < charsA->getLength(); ++i) {
2745       ch = (TextChar *)charsA->get(i);
2746       if (ch->fontSize > largeCharSize) {
2747 	chars2->append(ch);
2748       } else {
2749 	chars3->append(ch);
2750       }
2751     }
2752     blk = split(chars3, rot);
2753     insertLargeChars(chars2, blk);
2754     delete chars2;
2755     delete chars3;
2756 
2757   // create a leaf node
2758   } else {
2759     blk = new TextBlock(blkLeaf, rot);
2760     for (i = 0; i < charsA->getLength(); ++i) {
2761       blk->addChild((TextChar *)charsA->get(i));
2762     }
2763   }
2764 
2765   gfree(horizProfile);
2766   gfree(vertProfile);
2767 
2768   tagBlock(blk);
2769 
2770   return blk;
2771 }
2772 
2773 // Return the subset of chars inside a rectangle.
getChars(GList * charsA,double xMin,double yMin,double xMax,double yMax)2774 GList *TextPage::getChars(GList *charsA, double xMin, double yMin,
2775 			  double xMax, double yMax) {
2776   GList *ret;
2777   TextChar *ch;
2778   double x, y;
2779   int i;
2780 
2781   ret = new GList();
2782   for (i = 0; i < charsA->getLength(); ++i) {
2783     ch = (TextChar *)charsA->get(i);
2784     // because of {ascent,descent}AdjustFactor, the y coords (or x
2785     // coords for rot 1,3) for the gaps will be a little bit tight --
2786     // so we use the center of the character here
2787     x = 0.5 * (ch->xMin + ch->xMax);
2788     y = 0.5 * (ch->yMin + ch->yMax);
2789     if (x > xMin && x < xMax && y > yMin && y < yMax) {
2790       ret->append(ch);
2791     }
2792   }
2793   return ret;
2794 }
2795 
2796 // Decide whether this block is a line, column, or multiple columns:
2797 // - all leaf nodes are lines
2798 // - horiz split nodes whose children are lines or columns are columns
2799 // - other horiz split nodes are multiple columns
2800 // - vert split nodes, with small gaps, whose children are lines are lines
2801 // - other vert split nodes are multiple columns
2802 // (for rot=1,3: the horiz and vert splits are swapped)
2803 // In table layout mode:
2804 // - all leaf nodes are lines
2805 // - vert split nodes, with small gaps, whose children are lines are lines
2806 // - everything else is multiple columns
tagBlock(TextBlock * blk)2807 void TextPage::tagBlock(TextBlock *blk) {
2808   TextBlock *child;
2809   int i;
2810 
2811   if (control.mode == textOutTableLayout) {
2812     if (blk->type == blkLeaf) {
2813       blk->tag = blkTagLine;
2814     } else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit) &&
2815 	       blk->smallSplit) {
2816       blk->tag = blkTagLine;
2817       for (i = 0; i < blk->children->getLength(); ++i) {
2818 	child = (TextBlock *)blk->children->get(i);
2819 	if (child->tag != blkTagLine) {
2820 	  blk->tag = blkTagMulticolumn;
2821 	  break;
2822 	}
2823       }
2824     } else {
2825       blk->tag = blkTagMulticolumn;
2826     }
2827     return;
2828   }
2829 
2830   if (blk->type == blkLeaf) {
2831     blk->tag = blkTagLine;
2832 
2833   } else {
2834     if (blk->type == ((blk->rot & 1) ? blkVertSplit : blkHorizSplit)) {
2835       blk->tag = blkTagColumn;
2836       for (i = 0; i < blk->children->getLength(); ++i) {
2837 	child = (TextBlock *)blk->children->get(i);
2838 	if (child->tag != blkTagColumn && child->tag != blkTagLine) {
2839 	  blk->tag = blkTagMulticolumn;
2840 	  break;
2841 	}
2842       }
2843     } else {
2844       if (blk->smallSplit) {
2845 	blk->tag = blkTagLine;
2846 	for (i = 0; i < blk->children->getLength(); ++i) {
2847 	  child = (TextBlock *)blk->children->get(i);
2848 	  if (child->tag != blkTagLine) {
2849 	    blk->tag = blkTagMulticolumn;
2850 	    break;
2851 	  }
2852 	}
2853       } else {
2854 	blk->tag = blkTagMulticolumn;
2855       }
2856     }
2857   }
2858 }
2859 
2860 // Insert a list of large characters into a tree.
insertLargeChars(GList * largeChars,TextBlock * blk)2861 void TextPage::insertLargeChars(GList *largeChars, TextBlock *blk) {
2862   TextChar *ch, *ch2;
2863   GBool singleLine;
2864   double xLimit, yLimit, minOverlap;
2865   int i;
2866 
2867   //~ this currently works only for characters in the primary rotation
2868 
2869   // check to see if the large chars are a single line, in the
2870   // upper-left corner of blk (this is just a rough estimate)
2871   xLimit = blk->xMin + 0.5 * (blk->xMin + blk->xMax);
2872   yLimit = blk->yMin + 0.5 * (blk->yMin + blk->yMax);
2873   singleLine = gTrue;
2874   // note: largeChars are already sorted by x
2875   for (i = 0; i < largeChars->getLength(); ++i) {
2876     ch2 = (TextChar *)largeChars->get(i);
2877     if (ch2->xMax > xLimit || ch2->yMax > yLimit) {
2878       singleLine = gFalse;
2879       break;
2880     }
2881     if (i > 0) {
2882       ch = (TextChar *)largeChars->get(i-1);
2883       minOverlap = 0.5 * (ch->fontSize < ch2->fontSize ? ch->fontSize
2884 			                               : ch2->fontSize);
2885       if (ch->yMax - ch2->yMin < minOverlap ||
2886 	  ch2->yMax - ch->yMin < minOverlap) {
2887 	singleLine = gFalse;
2888 	break;
2889       }
2890     }
2891   }
2892 
2893   if (singleLine) {
2894     // if the large chars are a single line, prepend them to the first
2895     // leaf node in blk
2896     insertLargeCharsInFirstLeaf(largeChars, blk);
2897   } else {
2898     // if the large chars are not a single line, prepend each one to
2899     // the appropriate leaf node -- this handles cases like bullets
2900     // drawn in a large font, on the left edge of a column
2901     for (i = largeChars->getLength() - 1; i >= 0; --i) {
2902       ch = (TextChar *)largeChars->get(i);
2903       insertLargeCharInLeaf(ch, blk);
2904     }
2905   }
2906 }
2907 
2908 // Find the first leaf (in depth-first order) in blk, and prepend a
2909 // list of large chars.
insertLargeCharsInFirstLeaf(GList * largeChars,TextBlock * blk)2910 void TextPage::insertLargeCharsInFirstLeaf(GList *largeChars, TextBlock *blk) {
2911   TextChar *ch;
2912   int i;
2913 
2914   if (blk->type == blkLeaf) {
2915     for (i = largeChars->getLength() - 1; i >= 0; --i) {
2916       ch = (TextChar *)largeChars->get(i);
2917       blk->prependChild(ch);
2918     }
2919   } else {
2920     insertLargeCharsInFirstLeaf(largeChars, (TextBlock *)blk->children->get(0));
2921     blk->updateBounds(0);
2922   }
2923 }
2924 
2925 // Find the leaf in <blk> where large char <ch> belongs, and prepend
2926 // it.
insertLargeCharInLeaf(TextChar * ch,TextBlock * blk)2927 void TextPage::insertLargeCharInLeaf(TextChar *ch, TextBlock *blk) {
2928   TextBlock *child;
2929   double y;
2930   int i;
2931 
2932   //~ this currently works only for characters in the primary rotation
2933 
2934   //~ this currently just looks down the left edge of blk
2935   //~   -- it could be extended to do more
2936 
2937   // estimate the baseline of ch
2938   y = ch->yMin + 0.75 * (ch->yMax - ch->yMin);
2939 
2940   if (blk->type == blkLeaf) {
2941     blk->prependChild(ch);
2942   } else if (blk->type == blkHorizSplit) {
2943     for (i = 0; i < blk->children->getLength(); ++i) {
2944       child = (TextBlock *)blk->children->get(i);
2945       if (y < child->yMax || i == blk->children->getLength() - 1) {
2946 	insertLargeCharInLeaf(ch, child);
2947 	blk->updateBounds(i);
2948 	break;
2949       }
2950     }
2951   } else {
2952     insertLargeCharInLeaf(ch, (TextBlock *)blk->children->get(0));
2953     blk->updateBounds(0);
2954   }
2955 }
2956 
2957 // Merge blk (rot != 0) into primaryTree (rot == 0).
insertIntoTree(TextBlock * blk,TextBlock * primaryTree)2958 void TextPage::insertIntoTree(TextBlock *blk, TextBlock *primaryTree) {
2959   TextBlock *child;
2960 
2961   // we insert a whole column at a time - so call insertIntoTree
2962   // recursively until we get to a column (or line)
2963 
2964   if (blk->tag == blkTagMulticolumn) {
2965     while (blk->children->getLength()) {
2966       child = (TextBlock *)blk->children->del(0);
2967       insertIntoTree(child, primaryTree);
2968     }
2969     delete blk;
2970   } else {
2971     insertColumnIntoTree(blk, primaryTree);
2972   }
2973 }
2974 
2975 // Insert a column (as an atomic subtree) into tree.
2976 // Requirement: tree is not a leaf node.
insertColumnIntoTree(TextBlock * column,TextBlock * tree)2977 void TextPage::insertColumnIntoTree(TextBlock *column, TextBlock *tree) {
2978   TextBlock *child;
2979   int i;
2980 
2981   for (i = 0; i < tree->children->getLength(); ++i) {
2982     child = (TextBlock *)tree->children->get(i);
2983     if (child->tag == blkTagMulticolumn &&
2984 	column->xMin >= child->xMin &&
2985 	column->yMin >= child->yMin &&
2986 	column->xMax <= child->xMax &&
2987 	column->yMax <= child->yMax) {
2988       insertColumnIntoTree(column, child);
2989       tree->tag = blkTagMulticolumn;
2990       return;
2991     }
2992   }
2993 
2994   if (tree->type == blkVertSplit) {
2995     if (tree->rot == 1 || tree->rot == 2) {
2996       for (i = 0; i < tree->children->getLength(); ++i) {
2997 	child = (TextBlock *)tree->children->get(i);
2998 	if (column->xMax > 0.5 * (child->xMin + child->xMax)) {
2999 	  break;
3000 	}
3001       }
3002     } else {
3003       for (i = 0; i < tree->children->getLength(); ++i) {
3004 	child = (TextBlock *)tree->children->get(i);
3005 	if (column->xMin < 0.5 * (child->xMin + child->xMax)) {
3006 	  break;
3007 	}
3008       }
3009     }
3010   } else if (tree->type == blkHorizSplit) {
3011     if (tree->rot >= 2) {
3012       for (i = 0; i < tree->children->getLength(); ++i) {
3013 	child = (TextBlock *)tree->children->get(i);
3014 	if (column->yMax > 0.5 * (child->yMin + child->yMax)) {
3015 	  break;
3016 	}
3017       }
3018     } else {
3019       for (i = 0; i < tree->children->getLength(); ++i) {
3020 	child = (TextBlock *)tree->children->get(i);
3021 	if (column->yMin < 0.5 * (child->yMin + child->yMax)) {
3022 	  break;
3023 	}
3024       }
3025     }
3026   } else {
3027     // this should never happen
3028     return;
3029   }
3030   tree->children->insert(i, column);
3031   tree->tag = blkTagMulticolumn;
3032 }
3033 
3034 // Insert clipped characters back into the TextBlock tree.
insertClippedChars(GList * clippedChars,TextBlock * tree)3035 void TextPage::insertClippedChars(GList *clippedChars, TextBlock *tree) {
3036   TextChar *ch, *ch2;
3037   TextBlock *leaf;
3038   double y;
3039   int i;
3040 
3041   //~ this currently works only for characters in the primary rotation
3042 
3043   clippedChars->sort(TextChar::cmpX);
3044   while (clippedChars->getLength()) {
3045     ch = (TextChar *)clippedChars->del(0);
3046     if (ch->rot != 0) {
3047       continue;
3048     }
3049     if (!(leaf = findClippedCharLeaf(ch, tree))) {
3050       continue;
3051     }
3052     leaf->addChild(ch);
3053     i = 0;
3054     while (i < clippedChars->getLength()) {
3055       ch2 = (TextChar *)clippedChars->get(i);
3056       if (ch2->xMin > ch->xMax + clippedTextMaxWordSpace * ch->fontSize) {
3057 	break;
3058       }
3059       y = 0.5 * (ch2->yMin + ch2->yMax);
3060       if (y > leaf->yMin && y < leaf->yMax) {
3061 	ch2 = (TextChar *)clippedChars->del(i);
3062 	leaf->addChild(ch2);
3063 	ch = ch2;
3064       } else {
3065 	++i;
3066       }
3067     }
3068   }
3069 }
3070 
3071 // Find the leaf in <tree> to which clipped char <ch> can be appended.
3072 // Returns NULL if there is no appropriate append point.
findClippedCharLeaf(TextChar * ch,TextBlock * tree)3073 TextBlock *TextPage::findClippedCharLeaf(TextChar *ch, TextBlock *tree) {
3074   TextBlock *ret, *child;
3075   double y;
3076   int i;
3077 
3078   //~ this currently works only for characters in the primary rotation
3079 
3080   y = 0.5 * (ch->yMin + ch->yMax);
3081   if (tree->type == blkLeaf) {
3082     if (tree->rot == 0) {
3083       if (y > tree->yMin && y < tree->yMax &&
3084 	  ch->xMin <= tree->xMax + clippedTextMaxWordSpace * ch->fontSize) {
3085 	return tree;
3086       }
3087     }
3088   } else {
3089     for (i = 0; i < tree->children->getLength(); ++i) {
3090       child = (TextBlock *)tree->children->get(i);
3091       if ((ret = findClippedCharLeaf(ch, child))) {
3092 	return ret;
3093       }
3094     }
3095   }
3096   return NULL;
3097 }
3098 
3099 // Convert the tree of TextBlocks into a list of TextColumns.
buildColumns(TextBlock * tree)3100 GList *TextPage::buildColumns(TextBlock *tree) {
3101   GList *columns;
3102 
3103   columns = new GList();
3104   buildColumns2(tree, columns);
3105   return columns;
3106 }
3107 
buildColumns2(TextBlock * blk,GList * columns)3108 void TextPage::buildColumns2(TextBlock *blk, GList *columns) {
3109   TextColumn *col;
3110   int i;
3111 
3112   switch (blk->tag) {
3113   case blkTagLine:
3114   case blkTagColumn:
3115     col = buildColumn(blk);
3116     columns->append(col);
3117     break;
3118   case blkTagMulticolumn:
3119     for (i = 0; i < blk->children->getLength(); ++i) {
3120       buildColumns2((TextBlock *)blk->children->get(i), columns);
3121     }
3122     break;
3123   }
3124 }
3125 
buildColumn(TextBlock * blk)3126 TextColumn *TextPage::buildColumn(TextBlock *blk) {
3127   GList *lines, *parLines;
3128   GList *paragraphs;
3129   TextLine *line0, *line1;
3130   double spaceThresh, indent0, indent1, fontSize0, fontSize1;
3131   int i;
3132 
3133   lines = new GList();
3134   buildLines(blk, lines);
3135 
3136   spaceThresh = paragraphSpacingThreshold * getAverageLineSpacing(lines);
3137 
3138   //~ could look for bulleted lists here: look for the case where
3139   //~   all out-dented lines start with the same char
3140 
3141   // build the paragraphs
3142   paragraphs = new GList();
3143   i = 0;
3144   while (i < lines->getLength()) {
3145 
3146     // get the first line of the paragraph
3147     parLines = new GList();
3148     line0 = (TextLine *)lines->get(i);
3149     parLines->append(line0);
3150     ++i;
3151 
3152     if (i < lines->getLength()) {
3153       line1 = (TextLine *)lines->get(i);
3154       indent0 = getLineIndent(line0, blk);
3155       indent1 = getLineIndent(line1, blk);
3156       fontSize0 = line0->fontSize;
3157       fontSize1 = line1->fontSize;
3158 
3159       // inverted indent
3160       if (indent1 - indent0 > minParagraphIndent * fontSize0 &&
3161 	  fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
3162 	  getLineSpacing(line0, line1) <= spaceThresh) {
3163 	parLines->append(line1);
3164 	indent0 = indent1;
3165 	for (++i; i < lines->getLength(); ++i) {
3166 	  line1 = (TextLine *)lines->get(i);
3167 	  indent1 = getLineIndent(line1, blk);
3168 	  fontSize1 = line1->fontSize;
3169 	  if (indent0 - indent1 > minParagraphIndent * fontSize0) {
3170 	    break;
3171 	  }
3172 	  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
3173 	    break;
3174 	  }
3175 	  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
3176 	        > spaceThresh) {
3177 	    break;
3178 	  }
3179 	  parLines->append(line1);
3180 	}
3181 
3182       // drop cap
3183       } else if (fontSize0 > largeCharThreshold * fontSize1 &&
3184 		 indent1 - indent0 > minParagraphIndent * fontSize1 &&
3185 		 getLineSpacing(line0, line1) < 0) {
3186 	parLines->append(line1);
3187 	fontSize0 = fontSize1;
3188 	for (++i; i < lines->getLength(); ++i) {
3189 	  line1 = (TextLine *)lines->get(i);
3190 	  indent1 = getLineIndent(line1, blk);
3191 	  if (indent1 - indent0 <= minParagraphIndent * fontSize0) {
3192 	    break;
3193 	  }
3194 	  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
3195 	        > spaceThresh) {
3196 	    break;
3197 	  }
3198 	  parLines->append(line1);
3199 	}
3200 	for (; i < lines->getLength(); ++i) {
3201 	  line1 = (TextLine *)lines->get(i);
3202 	  indent1 = getLineIndent(line1, blk);
3203 	  fontSize1 = line1->fontSize;
3204 	  if (indent1 - indent0 > minParagraphIndent * fontSize0) {
3205 	    break;
3206 	  }
3207 	  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
3208 	    break;
3209 	  }
3210 	  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
3211 	        > spaceThresh) {
3212 	    break;
3213 	  }
3214 	  parLines->append(line1);
3215 	}
3216 
3217       // regular indent or no indent
3218       } else if (fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
3219 		 getLineSpacing(line0, line1) <= spaceThresh) {
3220 	parLines->append(line1);
3221 	indent0 = indent1;
3222 	for (++i; i < lines->getLength(); ++i) {
3223 	  line1 = (TextLine *)lines->get(i);
3224 	  indent1 = getLineIndent(line1, blk);
3225 	  fontSize1 = line1->fontSize;
3226 	  if (indent1 - indent0 > minParagraphIndent * fontSize0) {
3227 	    break;
3228 	  }
3229 	  if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
3230 	    break;
3231 	  }
3232 	  if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
3233 	        > spaceThresh) {
3234 	    break;
3235 	  }
3236 	  parLines->append(line1);
3237 	}
3238       }
3239     }
3240 
3241     paragraphs->append(new TextParagraph(parLines));
3242   }
3243 
3244   delete lines;
3245 
3246   return new TextColumn(paragraphs, blk->xMin, blk->yMin,
3247 			blk->xMax, blk->yMax);
3248 }
3249 
getLineIndent(TextLine * line,TextBlock * blk)3250 double TextPage::getLineIndent(TextLine *line, TextBlock *blk) {
3251   double indent;
3252 
3253   switch (line->rot) {
3254   case 0:
3255   default: indent = line->xMin - blk->xMin;  break;
3256   case 1:  indent = line->yMin - blk->yMin;  break;
3257   case 2:  indent = blk->xMax  - line->xMax; break;
3258   case 3:  indent = blk->yMax  - line->yMax; break;
3259   }
3260   return indent;
3261 }
3262 
3263 // Compute average line spacing in column.
getAverageLineSpacing(GList * lines)3264 double TextPage::getAverageLineSpacing(GList *lines) {
3265   double avg, sp;
3266   int n, i;
3267 
3268   avg = 0;
3269   n = 0;
3270   for (i = 1; i < lines->getLength(); ++i) {
3271     sp = getLineSpacing((TextLine *)lines->get(i - 1),
3272 			(TextLine *)lines->get(i));
3273     if (sp > 0) {
3274       avg += sp;
3275       ++n;
3276     }
3277   }
3278   if (n > 0) {
3279     avg /= n;
3280   }
3281   return avg;
3282 }
3283 
3284 // Compute the space between two lines.
getLineSpacing(TextLine * line0,TextLine * line1)3285 double TextPage::getLineSpacing(TextLine *line0, TextLine *line1) {
3286   double sp;
3287 
3288   switch (line0->rot) {
3289   case 0:
3290   default: sp = line1->yMin - line0->yMax; break;
3291   case 1:  sp = line0->xMin - line1->xMax; break;
3292   case 2:  sp = line0->yMin - line1->yMin; break;
3293   case 3:  sp = line1->xMin - line1->xMax; break;
3294   }
3295   return sp;
3296 }
3297 
buildLines(TextBlock * blk,GList * lines)3298 void TextPage::buildLines(TextBlock *blk, GList *lines) {
3299   TextLine *line;
3300   int i;
3301 
3302   switch (blk->tag) {
3303   case blkTagLine:
3304     line = buildLine(blk);
3305     if (blk->rot == 1 || blk->rot == 2) {
3306       lines->insert(0, line);
3307     } else {
3308       lines->append(line);
3309     }
3310     break;
3311   case blkTagColumn:
3312   case blkTagMulticolumn: // multicolumn should never happen here
3313     for (i = 0; i < blk->children->getLength(); ++i) {
3314       buildLines((TextBlock *)blk->children->get(i), lines);
3315     }
3316     break;
3317   }
3318 }
3319 
buildLine(TextBlock * blk)3320 TextLine *TextPage::buildLine(TextBlock *blk) {
3321   GList *charsA;
3322   GList *words;
3323   TextChar *ch, *ch2;
3324   TextWord *word;
3325   double wordSp, lineFontSize, sp;
3326   GBool spaceAfter, spaceAfter2;
3327   int i, j;
3328 
3329   charsA = new GList();
3330   getLineChars(blk, charsA);
3331 
3332   wordSp = computeWordSpacingThreshold(charsA, blk->rot);
3333 
3334   words = new GList();
3335   lineFontSize = 0;
3336   spaceAfter = gFalse;
3337   i = 0;
3338   while (i < charsA->getLength()) {
3339     sp = wordSp - 1;
3340     for (j = i+1; j < charsA->getLength(); ++j) {
3341       ch = (TextChar *)charsA->get(j-1);
3342       ch2 = (TextChar *)charsA->get(j);
3343       sp = (blk->rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
3344       if (sp > wordSp ||
3345 	  ch->font != ch2->font ||
3346 	  fabs(ch->fontSize - ch2->fontSize) > 0.01 ||
3347 	  (control.mode == textOutRawOrder &&
3348 	   ch2->charPos != ch->charPos + ch->charLen)) {
3349 	break;
3350       }
3351       sp = wordSp - 1;
3352     }
3353     spaceAfter2 = spaceAfter;
3354     spaceAfter = sp > wordSp;
3355     word = new TextWord(charsA, i, j - i, blk->rot,
3356 			(blk->rot >= 2) ? spaceAfter2 : spaceAfter);
3357     i = j;
3358     if (blk->rot >= 2) {
3359       words->insert(0, word);
3360     } else {
3361       words->append(word);
3362     }
3363     if (i == 0 || word->fontSize > lineFontSize) {
3364       lineFontSize = word->fontSize;
3365     }
3366   }
3367 
3368   delete charsA;
3369 
3370   return new TextLine(words, blk->xMin, blk->yMin, blk->xMax, blk->yMax,
3371 		      lineFontSize);
3372 }
3373 
getLineChars(TextBlock * blk,GList * charsA)3374 void TextPage::getLineChars(TextBlock *blk, GList *charsA) {
3375   int i;
3376 
3377   if (blk->type == blkLeaf) {
3378     charsA->append(blk->children);
3379   } else {
3380     for (i = 0; i < blk->children->getLength(); ++i) {
3381       getLineChars((TextBlock *)blk->children->get(i), charsA);
3382     }
3383   }
3384 }
3385 
3386 // Compute the inter-word spacing threshold for a line of chars.
3387 // Spaces greater than this threshold will be considered inter-word
3388 // spaces.
computeWordSpacingThreshold(GList * charsA,int rot)3389 double TextPage::computeWordSpacingThreshold(GList *charsA, int rot) {
3390   TextChar *ch, *ch2;
3391   double avgFontSize, minSp, maxSp, sp;
3392   int i;
3393 
3394   avgFontSize = 0;
3395   minSp = maxSp = 0;
3396   for (i = 0; i < charsA->getLength(); ++i) {
3397     ch = (TextChar *)charsA->get(i);
3398     avgFontSize += ch->fontSize;
3399     if (i < charsA->getLength() - 1) {
3400       ch2 = (TextChar *)charsA->get(i+1);
3401       sp = (rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
3402       if (i == 0 || sp < minSp) {
3403 	minSp = sp;
3404       }
3405       if (sp > maxSp) {
3406 	maxSp = sp;
3407       }
3408     }
3409   }
3410   avgFontSize /= charsA->getLength();
3411   if (minSp < 0) {
3412     minSp = 0;
3413   }
3414 
3415   // if spacing is completely uniform, assume it's a single word
3416   // (technically it could be either "ABC" or "A B C", but it's
3417   // essentially impossible to tell)
3418   if (maxSp - minSp < uniformSpacing * avgFontSize) {
3419     return maxSp + 1;
3420 
3421   // if there is some variation in spacing, but it's small, assume
3422   // there are some inter-word spaces
3423   } else if (maxSp - minSp < wordSpacing * avgFontSize) {
3424     return 0.5 * (minSp + maxSp);
3425 
3426   // otherwise, assume a reasonable threshold for inter-word spacing
3427   // (we can't use something like 0.5*(minSp+maxSp) here because there
3428   // can be outliers at the high end)
3429   } else {
3430     return minSp + wordSpacing * avgFontSize;
3431   }
3432 }
3433 
assignPhysLayoutPositions(GList * columns)3434 int TextPage::assignPhysLayoutPositions(GList *columns) {
3435   assignLinePhysPositions(columns);
3436   return assignColumnPhysPositions(columns);
3437 }
3438 
3439 // Assign a physical x coordinate for each TextLine (relative to the
3440 // containing TextColumn).  This also computes TextColumn width and
3441 // height.
assignLinePhysPositions(GList * columns)3442 void TextPage::assignLinePhysPositions(GList *columns) {
3443   TextColumn *col;
3444   TextParagraph *par;
3445   TextLine *line;
3446   UnicodeMap *uMap;
3447   int colIdx, parIdx, lineIdx;
3448 
3449   if (!(uMap = globalParams->getTextEncoding())) {
3450     return;
3451   }
3452 
3453   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3454     col = (TextColumn *)columns->get(colIdx);
3455     col->pw = col->ph = 0;
3456     for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
3457       par = (TextParagraph *)col->paragraphs->get(parIdx);
3458       for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
3459 	line = (TextLine *)par->lines->get(lineIdx);
3460 	computeLinePhysWidth(line, uMap);
3461 	if (control.fixedPitch > 0) {
3462 	  line->px = (int)((line->xMin - col->xMin) / control.fixedPitch);
3463 	} else if (fabs(line->fontSize) < 0.001) {
3464 	  line->px = 0;
3465 	} else {
3466 	  line->px = (int)((line->xMin - col->xMin) /
3467 			   (physLayoutSpaceWidth * line->fontSize));
3468 	}
3469 	if (line->px + line->pw > col->pw) {
3470 	  col->pw = line->px + line->pw;
3471 	}
3472       }
3473       col->ph += par->lines->getLength();
3474     }
3475     col->ph += col->paragraphs->getLength() - 1;
3476   }
3477 
3478   uMap->decRefCnt();
3479 }
3480 
computeLinePhysWidth(TextLine * line,UnicodeMap * uMap)3481 void TextPage::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) {
3482   char buf[8];
3483   int n, i;
3484 
3485   if (uMap->isUnicode()) {
3486     line->pw = line->len;
3487   } else {
3488     line->pw = 0;
3489     for (i = 0; i < line->len; ++i) {
3490       n = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
3491       line->pw += n;
3492     }
3493   }
3494 }
3495 
3496 // Assign physical x and y coordinates for each TextColumn.  Returns
3497 // the text height (max physical y + 1).
assignColumnPhysPositions(GList * columns)3498 int TextPage::assignColumnPhysPositions(GList *columns) {
3499   TextColumn *col, *col2;
3500   double slack, xOverlap, yOverlap;
3501   int ph, i, j;
3502 
3503   if (control.mode == textOutTableLayout) {
3504     slack = tableCellOverlapSlack;
3505   } else {
3506     slack = 0;
3507   }
3508 
3509   // assign x positions
3510   columns->sort(&TextColumn::cmpX);
3511   for (i = 0; i < columns->getLength(); ++i) {
3512     col = (TextColumn *)columns->get(i);
3513     if (control.fixedPitch) {
3514       col->px = (int)(col->xMin / control.fixedPitch);
3515     } else {
3516       col->px = 0;
3517       for (j = 0; j < i; ++j) {
3518 	col2 = (TextColumn *)columns->get(j);
3519 	xOverlap = col2->xMax - col->xMin;
3520 	if (xOverlap < slack * (col2->xMax - col2->xMin)) {
3521 	  if (col2->px + col2->pw + 2 > col->px) {
3522 	    col->px = col2->px + col2->pw + 2;
3523 	  }
3524 	} else {
3525 	  yOverlap = (col->yMax < col2->yMax ? col->yMax : col2->yMax) -
3526 	             (col->yMin > col2->yMin ? col->yMin : col2->yMin);
3527 	  if (yOverlap > 0 && xOverlap < yOverlap) {
3528 	    if (col2->px + col2->pw > col->px) {
3529 	      col->px = col2->px + col2->pw;
3530 	    }
3531 	  } else {
3532 	    if (col2->px > col->px) {
3533 	      col->px = col2->px;
3534 	    }
3535 	  }
3536 	}
3537       }
3538     }
3539   }
3540 
3541   // assign y positions
3542   ph = 0;
3543   columns->sort(&TextColumn::cmpY);
3544   for (i = 0; i < columns->getLength(); ++i) {
3545     col = (TextColumn *)columns->get(i);
3546     col->py = 0;
3547     for (j = 0; j < i; ++j) {
3548       col2 = (TextColumn *)columns->get(j);
3549       yOverlap = col2->yMax - col->yMin;
3550       if (yOverlap < slack * (col2->yMax - col2->yMin)) {
3551 	if (col2->py + col2->ph + 1 > col->py) {
3552 	  col->py = col2->py + col2->ph + 1;
3553 	}
3554       } else {
3555 	xOverlap = (col->xMax < col2->xMax ? col->xMax : col2->xMax) -
3556 	           (col->xMin > col2->xMin ? col->xMin : col2->xMin);
3557 	if (xOverlap > 0 && yOverlap < xOverlap) {
3558 	  if (col2->py + col2->ph > col->py) {
3559 	    col->py = col2->py + col2->ph;
3560 	  }
3561 	} else {
3562 	  if (col2->py > col->py) {
3563 	    col->py = col2->py;
3564 	  }
3565 	}
3566       }
3567     }
3568     if (col->py + col->ph > ph) {
3569       ph = col->py + col->ph;
3570     }
3571   }
3572 
3573   return ph;
3574 }
3575 
generateUnderlinesAndLinks(GList * columns)3576 void TextPage::generateUnderlinesAndLinks(GList *columns) {
3577   TextColumn *col;
3578   TextParagraph *par;
3579   TextLine *line;
3580   TextWord *word;
3581   TextUnderline *underline;
3582   TextLink *link;
3583   double base, uSlack, ubSlack, hSlack;
3584   int colIdx, parIdx, lineIdx, wordIdx, i;
3585 
3586   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3587     col = (TextColumn *)columns->get(colIdx);
3588     for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
3589       par = (TextParagraph *)col->paragraphs->get(parIdx);
3590       for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
3591 	line = (TextLine *)par->lines->get(lineIdx);
3592 	for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
3593 	  word = (TextWord *)line->words->get(wordIdx);
3594 	  base = word->getBaseline();
3595 	  uSlack = underlineSlack * word->fontSize;
3596 	  ubSlack = underlineBaselineSlack * word->fontSize;
3597 	  hSlack = hyperlinkSlack * word->fontSize;
3598 
3599 	  //----- handle underlining
3600 	  for (i = 0; i < underlines->getLength(); ++i) {
3601 	    underline = (TextUnderline *)underlines->get(i);
3602 	    if (underline->horiz) {
3603 	      if (word->rot == 0 || word->rot == 2) {
3604 		if (fabs(underline->y0 - base) < ubSlack &&
3605 		    underline->x0 < word->xMin + uSlack &&
3606 		    word->xMax - uSlack < underline->x1) {
3607 		  word->underlined = gTrue;
3608 		}
3609 	      }
3610 	    } else {
3611 	      if (word->rot == 1 || word->rot == 3) {
3612 		if (fabs(underline->x0 - base) < ubSlack &&
3613 		    underline->y0 < word->yMin + uSlack &&
3614 		    word->yMax - uSlack < underline->y1) {
3615 		  word->underlined = gTrue;
3616 		}
3617 	      }
3618 	    }
3619 	  }
3620 
3621 	  //----- handle links
3622 	  for (i = 0; i < links->getLength(); ++i) {
3623 	    link = (TextLink *)links->get(i);
3624 	    if (link->xMin < word->xMin + hSlack &&
3625 		word->xMax - hSlack < link->xMax &&
3626 		link->yMin < word->yMin + hSlack &&
3627 		word->yMax - hSlack < link->yMax) {
3628 	      word->link = link;
3629 	    }
3630 	  }
3631 	}
3632       }
3633     }
3634   }
3635 }
3636 
3637 //------------------------------------------------------------------------
3638 // TextPage: access
3639 //------------------------------------------------------------------------
3640 
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,GBool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3641 GBool TextPage::findText(Unicode *s, int len,
3642 			 GBool startAtTop, GBool stopAtBottom,
3643 			 GBool startAtLast, GBool stopAtLast,
3644 			 GBool caseSensitive, GBool backward,
3645 			 GBool wholeWord,
3646 			 double *xMin, double *yMin,
3647 			 double *xMax, double *yMax) {
3648   TextBlock *tree;
3649   TextColumn *column;
3650   TextParagraph *par;
3651   TextLine *line;
3652   Unicode *s2, *txt;
3653   Unicode *p;
3654   double xStart, yStart, xStop, yStop;
3655   double xMin0, yMin0, xMax0, yMax0;
3656   double xMin1, yMin1, xMax1, yMax1;
3657   GBool found;
3658   int txtSize, m, rot, colIdx, parIdx, lineIdx, i, j, k;
3659 
3660   //~ need to handle right-to-left text
3661 
3662   if (!findCols) {
3663     rot = rotateChars(chars);
3664     if ((tree = splitChars(chars))) {
3665       findCols = buildColumns(tree);
3666       delete tree;
3667     } else {
3668       // no text
3669       findCols = new GList();
3670     }
3671     unrotateChars(chars, rot);
3672     unrotateColumns(findCols, rot);
3673   }
3674 
3675   // convert the search string to uppercase
3676   if (!caseSensitive) {
3677     s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
3678     for (i = 0; i < len; ++i) {
3679       s2[i] = unicodeToUpper(s[i]);
3680     }
3681   } else {
3682     s2 = s;
3683   }
3684 
3685   txt = NULL;
3686   txtSize = 0;
3687 
3688   xStart = yStart = xStop = yStop = 0;
3689   if (startAtLast && haveLastFind) {
3690     xStart = lastFindXMin;
3691     yStart = lastFindYMin;
3692   } else if (!startAtTop) {
3693     xStart = *xMin;
3694     yStart = *yMin;
3695   }
3696   if (stopAtLast && haveLastFind) {
3697     xStop = lastFindXMin;
3698     yStop = lastFindYMin;
3699   } else if (!stopAtBottom) {
3700     xStop = *xMax;
3701     yStop = *yMax;
3702   }
3703 
3704   found = gFalse;
3705   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3706   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3707 
3708   for (colIdx = backward ? findCols->getLength() - 1 : 0;
3709        backward ? colIdx >= 0 : colIdx < findCols->getLength();
3710        colIdx += backward ? -1 : 1) {
3711     column = (TextColumn *)findCols->get(colIdx);
3712 
3713     // check: is the column above the top limit?
3714     if (!startAtTop && (backward ? column->yMin > yStart
3715 			         : column->yMax < yStart)) {
3716       continue;
3717     }
3718 
3719     // check: is the column below the bottom limit?
3720     if (!stopAtBottom && (backward ? column->yMax < yStop
3721 			           : column->yMin > yStop)) {
3722       continue;
3723     }
3724 
3725     for (parIdx = backward ? column->paragraphs->getLength() - 1 : 0;
3726 	 backward ? parIdx >= 0 : parIdx < column->paragraphs->getLength();
3727 	 parIdx += backward ? -1 : 1) {
3728       par = (TextParagraph *)column->paragraphs->get(parIdx);
3729 
3730       // check: is the paragraph above the top limit?
3731       if (!startAtTop && (backward ? par->yMin > yStart
3732 			           : par->yMax < yStart)) {
3733 	continue;
3734       }
3735 
3736       // check: is the paragraph below the bottom limit?
3737       if (!stopAtBottom && (backward ? par->yMax < yStop
3738 			             : par->yMin > yStop)) {
3739 	continue;
3740       }
3741 
3742       for (lineIdx = backward ? par->lines->getLength() - 1 : 0;
3743 	   backward ? lineIdx >= 0 : lineIdx < par->lines->getLength();
3744 	   lineIdx += backward ? -1 : 1) {
3745 	line = (TextLine *)par->lines->get(lineIdx);
3746 
3747 	// check: is the line above the top limit?
3748 	if (!startAtTop && (backward ? line->yMin > yStart
3749 			             : line->yMax < yStart)) {
3750 	  continue;
3751 	}
3752 
3753 	// check: is the line below the bottom limit?
3754 	if (!stopAtBottom && (backward ? line->yMax < yStop
3755 			               : line->yMin > yStop)) {
3756 	  continue;
3757 	}
3758 
3759 	// convert the line to uppercase
3760 	m = line->len;
3761 	if (!caseSensitive) {
3762 	  if (m > txtSize) {
3763 	    txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3764 	    txtSize = m;
3765 	  }
3766 	  for (k = 0; k < m; ++k) {
3767 	    txt[k] = unicodeToUpper(line->text[k]);
3768 	  }
3769 	} else {
3770 	  txt = line->text;
3771 	}
3772 
3773 	// search each position in this line
3774 	j = backward ? m - len : 0;
3775 	p = txt + j;
3776 	while (backward ? j >= 0 : j <= m - len) {
3777 	  if (!wholeWord ||
3778 	      ((j == 0 || !unicodeTypeWord(txt[j - 1])) &&
3779 	       (j + len == m || !unicodeTypeWord(txt[j + len])))) {
3780 
3781 	    // compare the strings
3782 	    for (k = 0; k < len; ++k) {
3783 	      if (p[k] != s2[k]) {
3784 		break;
3785 	      }
3786 	    }
3787 
3788 	    // found it
3789 	    if (k == len) {
3790 	      switch (line->rot) {
3791 	      case 0:
3792 		xMin1 = line->edge[j];
3793 		xMax1 = line->edge[j + len];
3794 		yMin1 = line->yMin;
3795 		yMax1 = line->yMax;
3796 		break;
3797 	      case 1:
3798 		xMin1 = line->xMin;
3799 		xMax1 = line->xMax;
3800 		yMin1 = line->edge[j];
3801 		yMax1 = line->edge[j + len];
3802 		break;
3803 	      case 2:
3804 		xMin1 = line->edge[j + len];
3805 		xMax1 = line->edge[j];
3806 		yMin1 = line->yMin;
3807 		yMax1 = line->yMax;
3808 		break;
3809 	      case 3:
3810 		xMin1 = line->xMin;
3811 		xMax1 = line->xMax;
3812 		yMin1 = line->edge[j + len];
3813 		yMax1 = line->edge[j];
3814 		break;
3815 	      }
3816 	      if (backward) {
3817 		if ((startAtTop ||
3818 		     yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3819 		    (stopAtBottom ||
3820 		     yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3821 		  if (!found ||
3822 		      yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3823 		    xMin0 = xMin1;
3824 		    xMax0 = xMax1;
3825 		    yMin0 = yMin1;
3826 		    yMax0 = yMax1;
3827 		    found = gTrue;
3828 		  }
3829 		}
3830 	      } else {
3831 		if ((startAtTop ||
3832 		     yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3833 		    (stopAtBottom ||
3834 		     yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3835 		  if (!found ||
3836 		      yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3837 		    xMin0 = xMin1;
3838 		    xMax0 = xMax1;
3839 		    yMin0 = yMin1;
3840 		    yMax0 = yMax1;
3841 		    found = gTrue;
3842 		  }
3843 		}
3844 	      }
3845 	    }
3846 	  }
3847 	  if (backward) {
3848 	    --j;
3849 	    --p;
3850 	  } else {
3851 	    ++j;
3852 	    ++p;
3853 	  }
3854 	}
3855       }
3856     }
3857   }
3858 
3859   if (!caseSensitive) {
3860     gfree(s2);
3861     gfree(txt);
3862   }
3863 
3864   if (found) {
3865     *xMin = xMin0;
3866     *xMax = xMax0;
3867     *yMin = yMin0;
3868     *yMax = yMax0;
3869     lastFindXMin = xMin0;
3870     lastFindYMin = yMin0;
3871     haveLastFind = gTrue;
3872     return gTrue;
3873   }
3874 
3875   return gFalse;
3876 }
3877 
getText(double xMin,double yMin,double xMax,double yMax)3878 GString *TextPage::getText(double xMin, double yMin,
3879 			   double xMax, double yMax) {
3880   UnicodeMap *uMap;
3881   char space[8], eol[16];
3882   int spaceLen, eolLen;
3883   GList *chars2;
3884   GString **out;
3885   int *outLen;
3886   TextColumn *col;
3887   TextParagraph *par;
3888   TextLine *line;
3889   TextChar *ch;
3890   GBool primaryLR;
3891   TextBlock *tree;
3892   GList *columns;
3893   GString *ret;
3894   double xx, yy;
3895   int rot, colIdx, parIdx, lineIdx, ph, y, i;
3896 
3897   // get the output encoding
3898   if (!(uMap = globalParams->getTextEncoding())) {
3899     return NULL;
3900   }
3901   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3902   eolLen = 0; // make gcc happy
3903   switch (globalParams->getTextEOL()) {
3904   case eolUnix:
3905     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3906     break;
3907   case eolDOS:
3908     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3909     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3910     break;
3911   case eolMac:
3912     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3913     break;
3914   }
3915 
3916   // get all chars in the rectangle
3917   // (i.e., all chars whose center lies inside the rectangle)
3918   chars2 = new GList();
3919   for (i = 0; i < chars->getLength(); ++i) {
3920     ch = (TextChar *)chars->get(i);
3921     xx = 0.5 * (ch->xMin + ch->xMax);
3922     yy = 0.5 * (ch->yMin + ch->yMax);
3923     if (xx > xMin && xx < xMax && yy > yMin && yy < yMax) {
3924       chars2->append(ch);
3925     }
3926   }
3927 #if 0 //~debug
3928   dumpChars(chars2);
3929 #endif
3930 
3931   rot = rotateChars(chars2);
3932   primaryLR = checkPrimaryLR(chars2);
3933   tree = splitChars(chars2);
3934   if (!tree) {
3935     unrotateChars(chars2, rot);
3936     delete chars2;
3937     return new GString();
3938   }
3939 #if 0 //~debug
3940   dumpTree(tree);
3941 #endif
3942   columns = buildColumns(tree);
3943   delete tree;
3944   ph = assignPhysLayoutPositions(columns);
3945 #if 0 //~debug
3946   dumpColumns(columns);
3947 #endif
3948   unrotateChars(chars2, rot);
3949   delete chars2;
3950 
3951   out = (GString **)gmallocn(ph, sizeof(GString *));
3952   outLen = (int *)gmallocn(ph, sizeof(int));
3953   for (i = 0; i < ph; ++i) {
3954     out[i] = NULL;
3955     outLen[i] = 0;
3956   }
3957 
3958   columns->sort(&TextColumn::cmpPX);
3959   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3960     col = (TextColumn *)columns->get(colIdx);
3961     y = col->py;
3962     for (parIdx = 0;
3963 	 parIdx < col->paragraphs->getLength() && y < ph;
3964 	 ++parIdx) {
3965       par = (TextParagraph *)col->paragraphs->get(parIdx);
3966       for (lineIdx = 0;
3967 	   lineIdx < par->lines->getLength() && y < ph;
3968 	   ++lineIdx) {
3969 	line = (TextLine *)par->lines->get(lineIdx);
3970 	if (!out[y]) {
3971 	  out[y] = new GString();
3972 	}
3973 	while (outLen[y] < col->px + line->px) {
3974 	  out[y]->append(space, spaceLen);
3975 	  ++outLen[y];
3976 	}
3977 	encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
3978 	outLen[y] += line->pw;
3979 	++y;
3980       }
3981       if (parIdx + 1 < col->paragraphs->getLength()) {
3982 	++y;
3983       }
3984     }
3985   }
3986 
3987   ret = new GString();
3988   for (i = 0; i < ph; ++i) {
3989     if (out[i]) {
3990       ret->append(out[i]);
3991       delete out[i];
3992     }
3993     if (ph > 1) {
3994       ret->append(eol, eolLen);
3995     }
3996   }
3997 
3998   gfree(out);
3999   gfree(outLen);
4000   deleteGList(columns, TextColumn);
4001   uMap->decRefCnt();
4002 
4003   return ret;
4004 }
4005 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)4006 GBool TextPage::findCharRange(int pos, int length,
4007 			      double *xMin, double *yMin,
4008 			      double *xMax, double *yMax) {
4009   TextChar *ch;
4010   double xMin2, yMin2, xMax2, yMax2;
4011   GBool first;
4012   int i;
4013 
4014   //~ this doesn't correctly handle ranges split across multiple lines
4015   //~ (the highlighted region is the bounding box of all the parts of
4016   //~ the range)
4017 
4018   xMin2 = yMin2 = xMax2 = yMax2 = 0;
4019   first = gTrue;
4020   for (i = 0; i < chars->getLength(); ++i) {
4021     ch = (TextChar *)chars->get(i);
4022     if (ch->charPos >= pos && ch->charPos < pos + length) {
4023       if (first || ch->xMin < xMin2) {
4024 	xMin2 = ch->xMin;
4025       }
4026       if (first || ch->yMin < yMin2) {
4027 	yMin2 = ch->yMin;
4028       }
4029       if (first || ch->xMax > xMax2) {
4030 	xMax2 = ch->xMax;
4031       }
4032       if (first || ch->yMax > yMax2) {
4033 	yMax2 = ch->yMax;
4034       }
4035       first = gFalse;
4036     }
4037   }
4038   if (first) {
4039     return gFalse;
4040   }
4041   *xMin = xMin2;
4042   *yMin = yMin2;
4043   *xMax = xMax2;
4044   *yMax = yMax2;
4045   return gTrue;
4046 }
4047 
makeWordList()4048 TextWordList *TextPage::makeWordList() {
4049   TextBlock *tree;
4050   GList *columns;
4051   TextColumn *col;
4052   TextParagraph *par;
4053   TextLine *line;
4054   TextWord *word;
4055   GList *words;
4056   int rot, colIdx, parIdx, lineIdx, wordIdx;
4057 
4058   rot = rotateChars(chars);
4059   tree = splitChars(chars);
4060   if (!tree) {
4061     // no text
4062     unrotateChars(chars, rot);
4063     return new TextWordList(new GList());
4064   }
4065   columns = buildColumns(tree);
4066   delete tree;
4067   unrotateChars(chars, rot);
4068   if (control.html) {
4069     rotateUnderlinesAndLinks(rot);
4070     generateUnderlinesAndLinks(columns);
4071   }
4072 
4073   words = new GList();
4074   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
4075     col = (TextColumn *)columns->get(colIdx);
4076     for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
4077       par = (TextParagraph *)col->paragraphs->get(parIdx);
4078       for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
4079 	line = (TextLine *)par->lines->get(lineIdx);
4080 	for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
4081 	  word = (TextWord *)line->words->get(wordIdx);
4082 	  words->append(word->copy());
4083 	}
4084       }
4085     }
4086   }
4087 
4088   switch (control.mode) {
4089   case textOutReadingOrder:
4090     // already in reading order
4091     break;
4092   case textOutPhysLayout:
4093   case textOutTableLayout:
4094   case textOutLinePrinter:
4095     words->sort(&TextWord::cmpYX);
4096     break;
4097   case textOutRawOrder:
4098     words->sort(&TextWord::cmpCharPos);
4099     break;
4100   }
4101 
4102   // this has to be done after sorting with cmpYX
4103   unrotateColumns(columns, rot);
4104   unrotateWords(words, rot);
4105 
4106   deleteGList(columns, TextColumn);
4107 
4108   return new TextWordList(words);
4109 }
4110 
4111 //------------------------------------------------------------------------
4112 // TextPage: debug
4113 //------------------------------------------------------------------------
4114 
4115 #if 0 //~debug
4116 
4117 void TextPage::dumpChars(GList *charsA) {
4118   TextChar *ch;
4119   int i;
4120 
4121   for (i = 0; i < charsA->getLength(); ++i) {
4122     ch = (TextChar *)charsA->get(i);
4123     printf("char: U+%04x '%c' xMin=%g yMin=%g xMax=%g yMax=%g fontSize=%g rot=%d\n",
4124 	   ch->c, ch->c & 0xff, ch->xMin, ch->yMin, ch->xMax, ch->yMax,
4125 	   ch->fontSize, ch->rot);
4126   }
4127 }
4128 
4129 void TextPage::dumpTree(TextBlock *tree, int indent) {
4130   TextChar *ch;
4131   int i;
4132 
4133   printf("%*sblock: type=%s tag=%s small=%d rot=%d xMin=%g yMin=%g xMax=%g yMax=%g\n",
4134 	 indent, "",
4135 	 tree->type == blkLeaf ? "leaf" :
4136 	                 tree->type == blkHorizSplit ? "horiz" : "vert",
4137 	 tree->tag == blkTagMulticolumn ? "multicolumn" :
4138 	                tree->tag == blkTagColumn ? "column" : "line",
4139 	 tree->smallSplit,
4140 	 tree->rot, tree->xMin, tree->yMin, tree->xMax, tree->yMax);
4141   if (tree->type == blkLeaf) {
4142     for (i = 0; i < tree->children->getLength(); ++i) {
4143       ch = (TextChar *)tree->children->get(i);
4144       printf("%*schar: '%c' xMin=%g yMin=%g xMax=%g yMax=%g font=%d.%d\n",
4145 	     indent + 2, "", ch->c & 0xff,
4146 	     ch->xMin, ch->yMin, ch->xMax, ch->yMax,
4147 	     ch->font->fontID.num, ch->font->fontID.gen);
4148     }
4149   } else {
4150     for (i = 0; i < tree->children->getLength(); ++i) {
4151       dumpTree((TextBlock *)tree->children->get(i), indent + 2);
4152     }
4153   }
4154 }
4155 
4156 void TextPage::dumpColumns(GList *columns) {
4157   TextColumn *col;
4158   TextParagraph *par;
4159   TextLine *line;
4160   int colIdx, parIdx, lineIdx, i;
4161 
4162   for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
4163     col = (TextColumn *)columns->get(colIdx);
4164     printf("column: xMin=%g yMin=%g xMax=%g yMax=%g px=%d py=%d pw=%d ph=%d\n",
4165 	   col->xMin, col->yMin, col->xMax, col->yMax,
4166 	   col->px, col->py, col->pw, col->ph);
4167     for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
4168       par = (TextParagraph *)col->paragraphs->get(parIdx);
4169       printf("  paragraph:\n");
4170       for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
4171 	line = (TextLine *)par->lines->get(lineIdx);
4172 	printf("    line: xMin=%g yMin=%g xMax=%g yMax=%g px=%d pw=%d rot=%d\n",
4173 	       line->xMin, line->yMin, line->xMax, line->yMax,
4174 	       line->px, line->pw, line->rot);
4175 	printf("          ");
4176 	for (i = 0; i < line->len; ++i) {
4177 	  printf("%c", line->text[i] & 0xff);
4178 	}
4179 	printf("\n");
4180       }
4181     }
4182   }
4183 }
4184 
4185 #endif //~debug
4186 
4187 //------------------------------------------------------------------------
4188 // TextOutputDev
4189 //------------------------------------------------------------------------
4190 
// Output callback used for file output: <stream> is the FILE * to
// write to; <len> bytes starting at <text> are written to it.
static void outputToFile(void *stream, const char *text, int len) {
  FILE *dest = (FILE *)stream;
  fwrite(text, 1, len, dest);
}
4194 
TextOutputDev(char * fileName,TextOutputControl * controlA,GBool append)4195 TextOutputDev::TextOutputDev(char *fileName, TextOutputControl *controlA,
4196 			     GBool append) {
4197   text = NULL;
4198   control = *controlA;
4199   ok = gTrue;
4200 
4201   // open file
4202   needClose = gFalse;
4203   if (fileName) {
4204     if (!strcmp(fileName, "-")) {
4205       outputStream = stdout;
4206 #ifdef WIN32
4207       // keep DOS from munging the end-of-line characters
4208       setmode(fileno(stdout), O_BINARY);
4209 #endif
4210     } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
4211       needClose = gTrue;
4212     } else {
4213       error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
4214       ok = gFalse;
4215       return;
4216     }
4217     outputFunc = &outputToFile;
4218   } else {
4219     outputStream = NULL;
4220   }
4221 
4222   // set up text object
4223   text = new TextPage(&control);
4224 }
4225 
TextOutputDev(TextOutputFunc func,void * stream,TextOutputControl * controlA)4226 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
4227 			     TextOutputControl *controlA) {
4228   outputFunc = func;
4229   outputStream = stream;
4230   needClose = gFalse;
4231   control = *controlA;
4232   text = new TextPage(&control);
4233   ok = gTrue;
4234 }
4235 
~TextOutputDev()4236 TextOutputDev::~TextOutputDev() {
4237   if (needClose) {
4238     fclose((FILE *)outputStream);
4239   }
4240   if (text) {
4241     delete text;
4242   }
4243 }
4244 
// Start a page by forwarding <state> to text->startPage().  <pageNum>
// is not used.
void TextOutputDev::startPage(int pageNum, GfxState *state) {
  text->startPage(state);
}
4248 
endPage()4249 void TextOutputDev::endPage() {
4250   if (outputStream) {
4251     text->write(outputStream, outputFunc);
4252   }
4253 }
4254 
// Forwards <state> to text->updateFont(), the same call that
// updateFont() makes.
void TextOutputDev::restoreState(GfxState *state) {
  text->updateFont(state);
}
4258 
// Forwards <state> to text->updateFont().
void TextOutputDev::updateFont(GfxState *state) {
  text->updateFont(state);
}
4262 
// No-op: this device takes no action here; both arguments are unused.
void TextOutputDev::beginString(GfxState *state, GString *s) {
}
4265 
// No-op: this device takes no action here; <state> is unused.
void TextOutputDev::endString(GfxState *state) {
}
4268 
// Forward one character to text->addChar().  All arguments are passed
// through unchanged except <originX> and <originY>, which are not
// used.
void TextOutputDev::drawChar(GfxState *state, double x, double y,
			     double dx, double dy,
			     double originX, double originY,
			     CharCode c, int nBytes, Unicode *u, int uLen) {
  text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
}
4275 
// Forwards <nChars> to text->incCharCount().
void TextOutputDev::incCharCount(int nChars) {
  text->incCharCount(nChars);
}
4279 
// Forwards <state> and the <uLen>-element Unicode array <u> to
// text->beginActualText().
void TextOutputDev::beginActualText(GfxState *state, Unicode *u, int uLen) {
  text->beginActualText(state, u, uLen);
}
4283 
// Forwards <state> to text->endActualText().
void TextOutputDev::endActualText(GfxState *state) {
  text->endActualText(state);
}
4287 
stroke(GfxState * state)4288 void TextOutputDev::stroke(GfxState *state) {
4289   GfxPath *path;
4290   GfxSubpath *subpath;
4291   double x[2], y[2];
4292 
4293   if (!control.html) {
4294     return;
4295   }
4296   path = state->getPath();
4297   if (path->getNumSubpaths() != 1) {
4298     return;
4299   }
4300   subpath = path->getSubpath(0);
4301   if (subpath->getNumPoints() != 2) {
4302     return;
4303   }
4304   state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
4305   state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
4306 
4307   // look for a vertical or horizontal line
4308   if (x[0] == x[1] || y[0] == y[1]) {
4309     text->addUnderline(x[0], y[0], x[1], y[1]);
4310   }
4311 }
4312 
fill(GfxState * state)4313 void TextOutputDev::fill(GfxState *state) {
4314   GfxPath *path;
4315   GfxSubpath *subpath;
4316   double x[5], y[5];
4317   double rx0, ry0, rx1, ry1, t;
4318   int i;
4319 
4320   if (!control.html) {
4321     return;
4322   }
4323   path = state->getPath();
4324   if (path->getNumSubpaths() != 1) {
4325     return;
4326   }
4327   subpath = path->getSubpath(0);
4328   if (subpath->getNumPoints() != 5) {
4329     return;
4330   }
4331   for (i = 0; i < 5; ++i) {
4332     if (subpath->getCurve(i)) {
4333       return;
4334     }
4335     state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
4336   }
4337 
4338   // look for a rectangle
4339   if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
4340       x[0] == x[4] && y[0] == y[4]) {
4341     rx0 = x[0];
4342     ry0 = y[0];
4343     rx1 = x[2];
4344     ry1 = y[1];
4345   } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
4346 	     x[0] == x[4] && y[0] == y[4]) {
4347     rx0 = x[0];
4348     ry0 = y[0];
4349     rx1 = x[1];
4350     ry1 = y[2];
4351   } else {
4352     return;
4353   }
4354   if (rx1 < rx0) {
4355     t = rx0;
4356     rx0 = rx1;
4357     rx1 = t;
4358   }
4359   if (ry1 < ry0) {
4360     t = ry0;
4361     ry0 = ry1;
4362     ry1 = t;
4363   }
4364 
4365   // skinny horizontal rectangle
4366   if (ry1 - ry0 < rx1 - rx0) {
4367     if (ry1 - ry0 < maxUnderlineWidth) {
4368       ry0 = 0.5 * (ry0 + ry1);
4369       text->addUnderline(rx0, ry0, rx1, ry0);
4370     }
4371 
4372   // skinny vertical rectangle
4373   } else {
4374     if (rx1 - rx0 < maxUnderlineWidth) {
4375       rx0 = 0.5 * (rx0 + rx1);
4376       text->addUnderline(rx0, ry0, rx0, ry1);
4377     }
4378   }
4379 }
4380 
eoFill(GfxState * state)4381 void TextOutputDev::eoFill(GfxState *state) {
4382   if (!control.html) {
4383     return;
4384   }
4385   fill(state);
4386 }
4387 
processLink(Link * link)4388 void TextOutputDev::processLink(Link *link) {
4389   double x1, y1, x2, y2;
4390   int xMin, yMin, xMax, yMax, x, y;
4391 
4392   if (!control.html) {
4393     return;
4394   }
4395   link->getRect(&x1, &y1, &x2, &y2);
4396   cvtUserToDev(x1, y1, &x, &y);
4397   xMin = xMax = x;
4398   yMin = yMax = y;
4399   cvtUserToDev(x1, y2, &x, &y);
4400   if (x < xMin) {
4401     xMin = x;
4402   } else if (x > xMax) {
4403     xMax = x;
4404   }
4405   if (y < yMin) {
4406     yMin = y;
4407   } else if (y > yMax) {
4408     yMax = y;
4409   }
4410   cvtUserToDev(x2, y1, &x, &y);
4411   if (x < xMin) {
4412     xMin = x;
4413   } else if (x > xMax) {
4414     xMax = x;
4415   }
4416   if (y < yMin) {
4417     yMin = y;
4418   } else if (y > yMax) {
4419     yMax = y;
4420   }
4421   cvtUserToDev(x2, y2, &x, &y);
4422   if (x < xMin) {
4423     xMin = x;
4424   } else if (x > xMax) {
4425     xMax = x;
4426   }
4427   if (y < yMin) {
4428     yMin = y;
4429   } else if (y > yMax) {
4430     yMax = y;
4431   }
4432   text->addLink(xMin, yMin, xMax, yMax, link);
4433 }
4434 
// Forwards every argument, unchanged and in the same order, to
// text->findText() and returns its result.
GBool TextOutputDev::findText(Unicode *s, int len,
			      GBool startAtTop, GBool stopAtBottom,
			      GBool startAtLast, GBool stopAtLast,
			      GBool caseSensitive, GBool backward,
			      GBool wholeWord,
			      double *xMin, double *yMin,
			      double *xMax, double *yMax) {
  return text->findText(s, len, startAtTop, stopAtBottom,
			startAtLast, stopAtLast,
			caseSensitive, backward, wholeWord,
			xMin, yMin, xMax, yMax);
}
4447 
// Forwards the rectangle (xMin, yMin, xMax, yMax) to text->getText()
// and returns the GString pointer it produces.
GString *TextOutputDev::getText(double xMin, double yMin,
				double xMax, double yMax) {
  return text->getText(xMin, yMin, xMax, yMax);
}
4452 
// Forwards every argument unchanged to text->findCharRange() and
// returns its result.
GBool TextOutputDev::findCharRange(int pos, int length,
				   double *xMin, double *yMin,
				   double *xMax, double *yMax) {
  return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
}
4458 
// Returns the TextWordList produced by text->makeWordList().
TextWordList *TextOutputDev::makeWordList() {
  return text->makeWordList();
}
4462 
takeText()4463 TextPage *TextOutputDev::takeText() {
4464   TextPage *ret;
4465 
4466   ret = text;
4467   text = new TextPage(&control);
4468   return ret;
4469 }
4470