1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
14 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
15 // Copyright (C) 2006-2008 Carlos Garcia Campos <carlosgc@gnome.org>
16 // Copyright (C) 2006, 2007 Ed Catmur <ed@catmur.co.uk>
17 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
18 // Copyright (C) 2007, 2008 Adrian Johnson <ajohnson@redneon.com>
19 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
20 // Copyright (C) 2008, 2010 Albert Astals Cid <aacid@kde.org>
21 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
22 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
23 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
24 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
25 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
26 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
27 //
28 // To see a description of the changes please see the Changelog file that
29 // came with your tarball or type make ChangeLog if you are building from git
30 //
31 //========================================================================
32 
33 #include <config.h>
34 
35 #ifdef USE_GCC_PRAGMAS
36 #pragma implementation
37 #endif
38 
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <stddef.h>
42 #include <math.h>
43 #include <float.h>
44 #include <ctype.h>
45 #ifdef _WIN32
46 #include <fcntl.h> // for O_BINARY
47 #include <io.h>    // for setmode
48 #endif
49 #include "goo/gmem.h"
50 #include "goo/GooString.h"
51 #include "goo/GooList.h"
52 #include "poppler-config.h"
53 #include "Error.h"
54 #include "GlobalParams.h"
55 #include "UnicodeMap.h"
56 #include "UnicodeTypeTable.h"
57 #include "Link.h"
58 #include "TextOutputDev.h"
59 #include "Page.h"
60 #include "PDFDocEncoding.h"
61 
62 #ifdef MACOS
63 // needed for setting type/creator of MacOS files
64 #include "ICSupport.h"
65 #endif
66 
67 //------------------------------------------------------------------------
68 // parameters
69 //------------------------------------------------------------------------
70 
71 // Each bucket in a text pool includes baselines within a range of
72 // this many points.
73 #define textPoolStep 4
74 
75 // Inter-character space width which will cause addChar to start a new
76 // word.
77 #define minWordBreakSpace 0.1
78 
79 // Negative inter-character space width, i.e., overlap, which will
80 // cause addChar to start a new word.
81 #define minDupBreakOverlap 0.2
82 
83 // Max distance between baselines of two lines within a block, as a
84 // fraction of the font size.
85 #define maxLineSpacingDelta 1.5
86 
87 // Max difference in primary font sizes on two lines in the same
88 // block.  Delta1 is used when examining new lines above and below the
89 // current block; delta2 is used when examining text that overlaps the
90 // current block; delta3 is used when examining text to the left and
91 // right of the current block.
92 #define maxBlockFontSizeDelta1 0.05
93 #define maxBlockFontSizeDelta2 0.6
94 #define maxBlockFontSizeDelta3 0.2
95 
96 // Max difference in font sizes inside a word.
97 #define maxWordFontSizeDelta 0.05
98 
99 // Maximum distance between baselines of two words on the same line,
100 // e.g., distance between subscript or superscript and the primary
101 // baseline, as a fraction of the font size.
102 #define maxIntraLineDelta 0.5
103 
104 // Minimum inter-word spacing, as a fraction of the font size.  (Only
105 // used for raw ordering.)
106 #define minWordSpacing 0.15
107 
108 // Maximum inter-word spacing, as a fraction of the font size.
109 #define maxWordSpacing 1.5
110 
111 // Maximum horizontal spacing which will allow a word to be pulled
112 // into a block.
113 #define minColSpacing1 0.3
114 
115 // Minimum spacing between columns, as a fraction of the font size.
116 #define minColSpacing2 1.0
117 
118 // Maximum vertical spacing between blocks within a flow, as a
119 // multiple of the font size.
120 #define maxBlockSpacing 2.5
121 
122 // Minimum spacing between characters within a word, as a fraction of
123 // the font size.
124 #define minCharSpacing -0.2
125 
126 // Maximum spacing between characters within a word, as a fraction of
127 // the font size, when there is no obvious extra-wide character
128 // spacing.
129 #define maxCharSpacing 0.03
130 
131 // When extra-wide character spacing is detected, the inter-character
132 // space threshold is set to the minimum inter-character space
133 // multiplied by this constant.
134 #define maxWideCharSpacingMul 1.3
135 
136 // Upper limit on spacing between characters in a word.
137 #define maxWideCharSpacing 0.4
138 
139 // Max difference in primary,secondary coordinates (as a fraction of
140 // the font size) allowed for duplicated text (fake boldface, drop
141 // shadows) which is to be discarded.
142 #define dupMaxPriDelta 0.1
143 #define dupMaxSecDelta 0.2
144 
145 // Max width of underlines (in points).
146 #define maxUnderlineWidth 3
147 
148 // Min distance between baseline and underline (in points).
149 //~ this should be font-size-dependent
150 #define minUnderlineGap -2
151 
152 // Max distance between baseline and underline (in points).
153 //~ this should be font-size-dependent
154 #define maxUnderlineGap 4
155 
156 // Max horizontal distance between edge of word and start of underline
157 // (in points).
158 //~ this should be font-size-dependent
159 #define underlineSlack 1
160 
161 // Max distance between edge of text and edge of link border
162 #define hyperlinkSlack 2
163 
164 //------------------------------------------------------------------------
165 // TextUnderline
166 //------------------------------------------------------------------------
167 
168 class TextUnderline {
169 public:
170 
TextUnderline(double x0A,double y0A,double x1A,double y1A)171   TextUnderline(double x0A, double y0A, double x1A, double y1A)
172     { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
~TextUnderline()173   ~TextUnderline() {}
174 
175   double x0, y0, x1, y1;
176   GBool horiz;
177 };
178 
179 //------------------------------------------------------------------------
180 // TextLink
181 //------------------------------------------------------------------------
182 
183 class TextLink {
184 public:
185 
TextLink(int xMinA,int yMinA,int xMaxA,int yMaxA,Link * linkA)186   TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, Link *linkA)
187     { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
~TextLink()188   ~TextLink() {}
189 
190   int xMin, yMin, xMax, yMax;
191   Link *link;
192 };
193 
194 //------------------------------------------------------------------------
195 // TextFontInfo
196 //------------------------------------------------------------------------
197 
TextFontInfo(GfxState * state)198 TextFontInfo::TextFontInfo(GfxState *state) {
199   gfxFont = state->getFont();
200   if (gfxFont)
201     gfxFont->incRefCnt();
202 #if TEXTOUT_WORD_LIST
203   fontName = (gfxFont && gfxFont->getOrigName())
204                  ? gfxFont->getOrigName()->copy()
205                  : (GooString *)NULL;
206   flags = gfxFont ? gfxFont->getFlags() : 0;
207 #endif
208 }
209 
~TextFontInfo()210 TextFontInfo::~TextFontInfo() {
211   if (gfxFont)
212     gfxFont->decRefCnt();
213 #if TEXTOUT_WORD_LIST
214   if (fontName) {
215     delete fontName;
216   }
217 #endif
218 }
219 
matches(GfxState * state)220 GBool TextFontInfo::matches(GfxState *state) {
221   return state->getFont() == gfxFont;
222 }
223 
224 //------------------------------------------------------------------------
225 // TextWord
226 //------------------------------------------------------------------------
227 
TextWord(GfxState * state,int rotA,double x0,double y0,int charPosA,TextFontInfo * fontA,double fontSizeA)228 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
229 		   int charPosA, TextFontInfo *fontA, double fontSizeA) {
230   GfxFont *gfxFont;
231   double x, y, ascent, descent;
232 
233   rot = rotA;
234   charPos = charPosA;
235   charLen = 0;
236   font = fontA;
237   fontSize = fontSizeA;
238   state->transform(x0, y0, &x, &y);
239   if ((gfxFont = font->gfxFont)) {
240     ascent = gfxFont->getAscent() * fontSize;
241     descent = gfxFont->getDescent() * fontSize;
242   } else {
243     // this means that the PDF file draws text without a current font,
244     // which should never happen
245     ascent = 0.95 * fontSize;
246     descent = -0.35 * fontSize;
247   }
248   switch (rot) {
249   case 0:
250     yMin = y - ascent;
251     yMax = y - descent;
252     if (yMin == yMax) {
253       // this is a sanity check for a case that shouldn't happen -- but
254       // if it does happen, we want to avoid dividing by zero later
255       yMin = y;
256       yMax = y + 1;
257     }
258     base = y;
259     break;
260   case 1:
261     xMin = x + descent;
262     xMax = x + ascent;
263     if (xMin == xMax) {
264       // this is a sanity check for a case that shouldn't happen -- but
265       // if it does happen, we want to avoid dividing by zero later
266       xMin = x;
267       xMax = x + 1;
268     }
269     base = x;
270     break;
271   case 2:
272     yMin = y + descent;
273     yMax = y + ascent;
274     if (yMin == yMax) {
275       // this is a sanity check for a case that shouldn't happen -- but
276       // if it does happen, we want to avoid dividing by zero later
277       yMin = y;
278       yMax = y + 1;
279     }
280     base = y;
281     break;
282   case 3:
283     xMin = x - ascent;
284     xMax = x - descent;
285     if (xMin == xMax) {
286       // this is a sanity check for a case that shouldn't happen -- but
287       // if it does happen, we want to avoid dividing by zero later
288       xMin = x;
289       xMax = x + 1;
290     }
291     base = x;
292     break;
293   }
294   text = NULL;
295   charcode = NULL;
296   edge = NULL;
297   len = size = 0;
298   spaceAfter = gFalse;
299   next = NULL;
300 
301 #if TEXTOUT_WORD_LIST
302   GfxRGB rgb;
303 
304   if ((state->getRender() & 3) == 1) {
305     state->getStrokeRGB(&rgb);
306   } else {
307     state->getFillRGB(&rgb);
308   }
309   colorR = colToDbl(rgb.r);
310   colorG = colToDbl(rgb.g);
311   colorB = colToDbl(rgb.b);
312 #endif
313 
314   underlined = gFalse;
315   link = NULL;
316 }
317 
~TextWord()318 TextWord::~TextWord() {
319   gfree(text);
320   gfree(charcode);
321   gfree(edge);
322 }
323 
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,Unicode u)324 void TextWord::addChar(GfxState *state, double x, double y,
325 		       double dx, double dy, CharCode c, Unicode u) {
326   if (len == size) {
327     size += 16;
328     text = (Unicode *)greallocn(text, size, sizeof(Unicode));
329     charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode));
330     edge = (double *)greallocn(edge, (size + 1), sizeof(double));
331   }
332   text[len] = u;
333   charcode[len] = c;
334   switch (rot) {
335   case 0:
336     if (len == 0) {
337       xMin = x;
338     }
339     edge[len] = x;
340     xMax = edge[len+1] = x + dx;
341     break;
342   case 1:
343     if (len == 0) {
344       yMin = y;
345     }
346     edge[len] = y;
347     yMax = edge[len+1] = y + dy;
348     break;
349   case 2:
350     if (len == 0) {
351       xMax = x;
352     }
353     edge[len] = x;
354     xMin = edge[len+1] = x + dx;
355     break;
356   case 3:
357     if (len == 0) {
358       yMax = y;
359     }
360     edge[len] = y;
361     yMin = edge[len+1] = y + dy;
362     break;
363   }
364   ++len;
365 }
366 
merge(TextWord * word)367 void TextWord::merge(TextWord *word) {
368   int i;
369 
370   if (word->xMin < xMin) {
371     xMin = word->xMin;
372   }
373   if (word->yMin < yMin) {
374     yMin = word->yMin;
375   }
376   if (word->xMax > xMax) {
377     xMax = word->xMax;
378   }
379   if (word->yMax > yMax) {
380     yMax = word->yMax;
381   }
382   if (len + word->len > size) {
383     size = len + word->len;
384     text = (Unicode *)greallocn(text, size, sizeof(Unicode));
385     charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
386     edge = (double *)greallocn(edge, (size + 1), sizeof(double));
387   }
388   for (i = 0; i < word->len; ++i) {
389     text[len + i] = word->text[i];
390     charcode[len + i] = word->charcode[i];
391     edge[len + i] = word->edge[i];
392   }
393   edge[len + word->len] = word->edge[word->len];
394   len += word->len;
395   charLen += word->charLen;
396 }
397 
primaryCmp(TextWord * word)398 inline int TextWord::primaryCmp(TextWord *word) {
399   double cmp;
400 
401   cmp = 0; // make gcc happy
402   switch (rot) {
403   case 0:
404     cmp = xMin - word->xMin;
405     break;
406   case 1:
407     cmp = yMin - word->yMin;
408     break;
409   case 2:
410     cmp = word->xMax - xMax;
411     break;
412   case 3:
413     cmp = word->yMax - yMax;
414     break;
415   }
416   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
417 }
418 
primaryDelta(TextWord * word)419 double TextWord::primaryDelta(TextWord *word) {
420   double delta;
421 
422   delta = 0; // make gcc happy
423   switch (rot) {
424   case 0:
425     delta = word->xMin - xMax;
426     break;
427   case 1:
428     delta = word->yMin - yMax;
429     break;
430   case 2:
431     delta = xMin - word->xMax;
432     break;
433   case 3:
434     delta = yMin - word->yMax;
435     break;
436   }
437   return delta;
438 }
439 
cmpYX(const void * p1,const void * p2)440 int TextWord::cmpYX(const void *p1, const void *p2) {
441   TextWord *word1 = *(TextWord **)p1;
442   TextWord *word2 = *(TextWord **)p2;
443   double cmp;
444 
445   cmp = word1->yMin - word2->yMin;
446   if (cmp == 0) {
447     cmp = word1->xMin - word2->xMin;
448   }
449   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
450 }
451 
452 #if TEXTOUT_WORD_LIST
453 
getText()454 GooString *TextWord::getText() {
455   GooString *s;
456   UnicodeMap *uMap;
457   char buf[8];
458   int n, i;
459 
460   s = new GooString();
461   if (!(uMap = globalParams->getTextEncoding())) {
462     return s;
463   }
464   for (i = 0; i < len; ++i) {
465     n = uMap->mapUnicode(text[i], buf, sizeof(buf));
466     s->append(buf, n);
467   }
468   uMap->decRefCnt();
469   return s;
470 }
471 
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)472 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
473 			   double *xMaxA, double *yMaxA) {
474   if (charIdx < 0 || charIdx >= len) {
475     return;
476   }
477   switch (rot) {
478   case 0:
479     *xMinA = edge[charIdx];
480     *xMaxA = edge[charIdx + 1];
481     *yMinA = yMin;
482     *yMaxA = yMax;
483     break;
484   case 1:
485     *xMinA = xMin;
486     *xMaxA = xMax;
487     *yMinA = edge[charIdx];
488     *yMaxA = edge[charIdx + 1];
489     break;
490   case 2:
491     *xMinA = edge[charIdx + 1];
492     *xMaxA = edge[charIdx];
493     *yMinA = yMin;
494     *yMaxA = yMax;
495     break;
496   case 3:
497     *xMinA = xMin;
498     *xMaxA = xMax;
499     *yMinA = edge[charIdx + 1];
500     *yMaxA = edge[charIdx];
501     break;
502   }
503 }
504 
505 #endif // TEXTOUT_WORD_LIST
506 
507 //------------------------------------------------------------------------
508 // TextPool
509 //------------------------------------------------------------------------
510 
TextPool()511 TextPool::TextPool() {
512   minBaseIdx = 0;
513   maxBaseIdx = -1;
514   pool = NULL;
515   cursor = NULL;
516   cursorBaseIdx = -1;
517 }
518 
~TextPool()519 TextPool::~TextPool() {
520   int baseIdx;
521   TextWord *word, *word2;
522 
523   for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
524     for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
525       word2 = word->next;
526       delete word;
527     }
528   }
529   gfree(pool);
530 }
531 
getBaseIdx(double base)532 int TextPool::getBaseIdx(double base) {
533   int baseIdx;
534 
535   baseIdx = (int)(base / textPoolStep);
536   if (baseIdx < minBaseIdx) {
537     return minBaseIdx;
538   }
539   if (baseIdx > maxBaseIdx) {
540     return maxBaseIdx;
541   }
542   return baseIdx;
543 }
544 
addWord(TextWord * word)545 void TextPool::addWord(TextWord *word) {
546   TextWord **newPool;
547   int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
548   TextWord *w0, *w1;
549 
550   // expand the array if needed
551   wordBaseIdx = (int)(word->base / textPoolStep);
552   if (minBaseIdx > maxBaseIdx) {
553     minBaseIdx = wordBaseIdx - 128;
554     maxBaseIdx = wordBaseIdx + 128;
555     pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
556 				 sizeof(TextWord *));
557     for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
558       pool[baseIdx - minBaseIdx] = NULL;
559     }
560   } else if (wordBaseIdx < minBaseIdx) {
561     newMinBaseIdx = wordBaseIdx - 128;
562     newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
563 				    sizeof(TextWord *));
564     for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
565       newPool[baseIdx - newMinBaseIdx] = NULL;
566     }
567     memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
568 	   (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
569     gfree(pool);
570     pool = newPool;
571     minBaseIdx = newMinBaseIdx;
572   } else if (wordBaseIdx > maxBaseIdx) {
573     newMaxBaseIdx = wordBaseIdx + 128;
574     pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
575 				  sizeof(TextWord *));
576     for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
577       pool[baseIdx - minBaseIdx] = NULL;
578     }
579     maxBaseIdx = newMaxBaseIdx;
580   }
581 
582   // insert the new word
583   if (cursor && wordBaseIdx == cursorBaseIdx &&
584       word->primaryCmp(cursor) > 0) {
585     w0 = cursor;
586     w1 = cursor->next;
587   } else {
588     w0 = NULL;
589     w1 = pool[wordBaseIdx - minBaseIdx];
590   }
591   for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
592   word->next = w1;
593   if (w0) {
594     w0->next = word;
595   } else {
596     pool[wordBaseIdx - minBaseIdx] = word;
597   }
598   cursor = word;
599   cursorBaseIdx = wordBaseIdx;
600 }
601 
602 //------------------------------------------------------------------------
603 // TextLine
604 //------------------------------------------------------------------------
605 
TextLine(TextBlock * blkA,int rotA,double baseA)606 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
607   blk = blkA;
608   rot = rotA;
609   base = baseA;
610   words = lastWord = NULL;
611   text = NULL;
612   edge = NULL;
613   col = NULL;
614   len = 0;
615   convertedLen = 0;
616   hyphenated = gFalse;
617   next = NULL;
618   xMin = yMin = 0;
619   xMax = yMax = -1;
620   normalized = NULL;
621   normalized_len = 0;
622   normalized_idx = NULL;
623 }
624 
~TextLine()625 TextLine::~TextLine() {
626   TextWord *word;
627 
628   while (words) {
629     word = words;
630     words = words->next;
631     delete word;
632   }
633   gfree(text);
634   gfree(edge);
635   gfree(col);
636   if (normalized) {
637     gfree(normalized);
638     gfree(normalized_idx);
639   }
640 }
641 
addWord(TextWord * word)642 void TextLine::addWord(TextWord *word) {
643   if (lastWord) {
644     lastWord->next = word;
645   } else {
646     words = word;
647   }
648   lastWord = word;
649 
650   if (xMin > xMax) {
651     xMin = word->xMin;
652     xMax = word->xMax;
653     yMin = word->yMin;
654     yMax = word->yMax;
655   } else {
656     if (word->xMin < xMin) {
657       xMin = word->xMin;
658     }
659     if (word->xMax > xMax) {
660       xMax = word->xMax;
661     }
662     if (word->yMin < yMin) {
663       yMin = word->yMin;
664     }
665     if (word->yMax > yMax) {
666       yMax = word->yMax;
667     }
668   }
669 }
670 
primaryDelta(TextLine * line)671 double TextLine::primaryDelta(TextLine *line) {
672   double delta;
673 
674   delta = 0; // make gcc happy
675   switch (rot) {
676   case 0:
677     delta = line->xMin - xMax;
678     break;
679   case 1:
680     delta = line->yMin - yMax;
681     break;
682   case 2:
683     delta = xMin - line->xMax;
684     break;
685   case 3:
686     delta = yMin - line->yMax;
687     break;
688   }
689   return delta;
690 }
691 
primaryCmp(TextLine * line)692 int TextLine::primaryCmp(TextLine *line) {
693   double cmp;
694 
695   cmp = 0; // make gcc happy
696   switch (rot) {
697   case 0:
698     cmp = xMin - line->xMin;
699     break;
700   case 1:
701     cmp = yMin - line->yMin;
702     break;
703   case 2:
704     cmp = line->xMax - xMax;
705     break;
706   case 3:
707     cmp = line->yMax - yMax;
708     break;
709   }
710   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
711 }
712 
secondaryCmp(TextLine * line)713 int TextLine::secondaryCmp(TextLine *line) {
714   double cmp;
715 
716   cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
717   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
718 }
719 
cmpYX(TextLine * line)720 int TextLine::cmpYX(TextLine *line) {
721   int cmp;
722 
723   if ((cmp = secondaryCmp(line))) {
724     return cmp;
725   }
726   return primaryCmp(line);
727 }
728 
cmpXY(const void * p1,const void * p2)729 int TextLine::cmpXY(const void *p1, const void *p2) {
730   TextLine *line1 = *(TextLine **)p1;
731   TextLine *line2 = *(TextLine **)p2;
732   int cmp;
733 
734   if ((cmp = line1->primaryCmp(line2))) {
735     return cmp;
736   }
737   return line1->secondaryCmp(line2);
738 }
739 
coalesce(UnicodeMap * uMap)740 void TextLine::coalesce(UnicodeMap *uMap) {
741   TextWord *word0, *word1;
742   double space, delta, minSpace;
743   GBool isUnicode;
744   char buf[8];
745   int i, j;
746 
747   if (words->next) {
748 
749     // compute the inter-word space threshold
750     if (words->len > 1 || words->next->len > 1) {
751       minSpace = 0;
752     } else {
753       minSpace = words->primaryDelta(words->next);
754       for (word0 = words->next, word1 = word0->next;
755 	   word1 && minSpace > 0;
756 	   word0 = word1, word1 = word0->next) {
757 	if (word1->len > 1) {
758 	  minSpace = 0;
759 	}
760 	delta = word0->primaryDelta(word1);
761 	if (delta < minSpace) {
762 	  minSpace = delta;
763 	}
764       }
765     }
766     if (minSpace <= 0) {
767       space = maxCharSpacing * words->fontSize;
768     } else {
769       space = maxWideCharSpacingMul * minSpace;
770       if (space > maxWideCharSpacing * words->fontSize) {
771 	space = maxWideCharSpacing * words->fontSize;
772       }
773     }
774 
775     // merge words
776     word0 = words;
777     word1 = words->next;
778     while (word1) {
779       if (word0->primaryDelta(word1) >= space) {
780 	word0->spaceAfter = gTrue;
781 	word0 = word1;
782 	word1 = word1->next;
783       } else if (word0->font == word1->font &&
784 		 word0->underlined == word1->underlined &&
785 		 fabs(word0->fontSize - word1->fontSize) <
786 		   maxWordFontSizeDelta * words->fontSize &&
787 		 word1->charPos == word0->charPos + word0->charLen) {
788 	word0->merge(word1);
789 	word0->next = word1->next;
790 	delete word1;
791 	word1 = word0->next;
792       } else {
793 	word0 = word1;
794 	word1 = word1->next;
795       }
796     }
797   }
798 
799   // build the line text
800   isUnicode = uMap ? uMap->isUnicode() : gFalse;
801   len = 0;
802   for (word1 = words; word1; word1 = word1->next) {
803     len += word1->len;
804     if (word1->spaceAfter) {
805       ++len;
806     }
807   }
808   text = (Unicode *)gmallocn(len, sizeof(Unicode));
809   edge = (double *)gmallocn(len + 1, sizeof(double));
810   i = 0;
811   for (word1 = words; word1; word1 = word1->next) {
812     for (j = 0; j < word1->len; ++j) {
813       text[i] = word1->text[j];
814       edge[i] = word1->edge[j];
815       ++i;
816     }
817     edge[i] = word1->edge[word1->len];
818     if (word1->spaceAfter) {
819       text[i] = (Unicode)0x0020;
820       ++i;
821     }
822   }
823 
824   // compute convertedLen and set up the col array
825   col = (int *)gmallocn(len + 1, sizeof(int));
826   convertedLen = 0;
827   for (i = 0; i < len; ++i) {
828     col[i] = convertedLen;
829     if (isUnicode) {
830       ++convertedLen;
831     } else if (uMap) {
832       convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
833     }
834   }
835   col[len] = convertedLen;
836 
837   // check for hyphen at end of line
838   //~ need to check for other chars used as hyphens
839   hyphenated = text[len - 1] == (Unicode)'-';
840 }
841 
842 //------------------------------------------------------------------------
843 // TextLineFrag
844 //------------------------------------------------------------------------
845 
846 class TextLineFrag {
847 public:
848 
849   TextLine *line;		// the line object
850   int start, len;		// offset and length of this fragment
851 				//   (in Unicode chars)
852   double xMin, xMax;		// bounding box coordinates
853   double yMin, yMax;
854   double base;			// baseline virtual coordinate
855   int col;			// first column
856 
857   void init(TextLine *lineA, int startA, int lenA);
858   void computeCoords(GBool oneRot);
859 
860   static int cmpYXPrimaryRot(const void *p1, const void *p2);
861   static int cmpYXLineRot(const void *p1, const void *p2);
862   static int cmpXYLineRot(const void *p1, const void *p2);
863   static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
864   static int cmpXYColumnLineRot(const void *p1, const void *p2);
865 };
866 
init(TextLine * lineA,int startA,int lenA)867 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
868   line = lineA;
869   start = startA;
870   len = lenA;
871   col = line->col[start];
872 }
873 
computeCoords(GBool oneRot)874 void TextLineFrag::computeCoords(GBool oneRot) {
875   TextBlock *blk;
876   double d0, d1, d2, d3, d4;
877 
878   if (oneRot) {
879 
880     switch (line->rot) {
881     case 0:
882       xMin = line->edge[start];
883       xMax = line->edge[start + len];
884       yMin = line->yMin;
885       yMax = line->yMax;
886       break;
887     case 1:
888       xMin = line->xMin;
889       xMax = line->xMax;
890       yMin = line->edge[start];
891       yMax = line->edge[start + len];
892       break;
893     case 2:
894       xMin = line->edge[start + len];
895       xMax = line->edge[start];
896       yMin = line->yMin;
897       yMax = line->yMax;
898       break;
899     case 3:
900       xMin = line->xMin;
901       xMax = line->xMax;
902       yMin = line->edge[start + len];
903       yMax = line->edge[start];
904       break;
905     }
906     base = line->base;
907 
908   } else {
909 
910     if (line->rot == 0 && line->blk->page->primaryRot == 0) {
911 
912       xMin = line->edge[start];
913       xMax = line->edge[start + len];
914       yMin = line->yMin;
915       yMax = line->yMax;
916       base = line->base;
917 
918     } else {
919 
920       blk = line->blk;
921       d0 = line->edge[start];
922       d1 = line->edge[start + len];
923       d2 = d3 = d4 = 0; // make gcc happy
924 
925       switch (line->rot) {
926       case 0:
927 	d2 = line->yMin;
928 	d3 = line->yMax;
929 	d4 = line->base;
930 	d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
931 	d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
932 	d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
933 	d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
934 	d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
935 	break;
936       case 1:
937 	d2 = line->xMax;
938 	d3 = line->xMin;
939 	d4 = line->base;
940 	d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
941 	d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
942 	d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
943 	d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
944 	d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
945 	break;
946       case 2:
947 	d2 = line->yMax;
948 	d3 = line->yMin;
949 	d4 = line->base;
950 	d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
951 	d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
952 	d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
953 	d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
954 	d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
955 	break;
956       case 3:
957 	d2 = line->xMin;
958 	d3 = line->xMax;
959 	d4 = line->base;
960 	d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
961 	d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
962 	d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
963 	d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
964 	d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
965 	break;
966       }
967 
968       switch (line->blk->page->primaryRot) {
969       case 0:
970 	xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
971 	xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
972 	yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
973 	yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
974 	base = blk->yMin + base * (blk->yMax - blk->yMin);
975 	break;
976       case 1:
977 	xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
978 	xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
979 	yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
980 	yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
981 	base = blk->xMax - d4 * (blk->xMax - blk->xMin);
982 	break;
983       case 2:
984 	xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
985 	xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
986 	yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
987 	yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
988 	base = blk->yMax - d4 * (blk->yMax - blk->yMin);
989 	break;
990       case 3:
991 	xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
992 	xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
993 	yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
994 	yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
995 	base = blk->xMin + d4 * (blk->xMax - blk->xMin);
996 	break;
997       }
998 
999     }
1000   }
1001 }
1002 
cmpYXPrimaryRot(const void * p1,const void * p2)1003 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
1004   TextLineFrag *frag1 = (TextLineFrag *)p1;
1005   TextLineFrag *frag2 = (TextLineFrag *)p2;
1006   double cmp;
1007 
1008   cmp = 0; // make gcc happy
1009   switch (frag1->line->blk->page->primaryRot) {
1010   case 0:
1011     if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1012       cmp = frag1->xMin - frag2->xMin;
1013     }
1014     break;
1015   case 1:
1016     if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1017       cmp = frag1->yMin - frag2->yMin;
1018     }
1019     break;
1020   case 2:
1021     if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1022       cmp = frag2->xMax - frag1->xMax;
1023     }
1024     break;
1025   case 3:
1026     if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1027       cmp = frag2->yMax - frag1->yMax;
1028     }
1029     break;
1030   }
1031   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1032 }
1033 
cmpYXLineRot(const void * p1,const void * p2)1034 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
1035   TextLineFrag *frag1 = (TextLineFrag *)p1;
1036   TextLineFrag *frag2 = (TextLineFrag *)p2;
1037   double cmp;
1038 
1039   cmp = 0; // make gcc happy
1040   switch (frag1->line->rot) {
1041   case 0:
1042     if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1043       cmp = frag1->xMin - frag2->xMin;
1044     }
1045     break;
1046   case 1:
1047     if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1048       cmp = frag1->yMin - frag2->yMin;
1049     }
1050     break;
1051   case 2:
1052     if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1053       cmp = frag2->xMax - frag1->xMax;
1054     }
1055     break;
1056   case 3:
1057     if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1058       cmp = frag2->yMax - frag1->yMax;
1059     }
1060     break;
1061   }
1062   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1063 }
1064 
cmpXYLineRot(const void * p1,const void * p2)1065 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1066   TextLineFrag *frag1 = (TextLineFrag *)p1;
1067   TextLineFrag *frag2 = (TextLineFrag *)p2;
1068   double cmp;
1069 
1070   cmp = 0; // make gcc happy
1071   switch (frag1->line->rot) {
1072   case 0:
1073     if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1074       cmp = frag1->yMin - frag2->yMin;
1075     }
1076     break;
1077   case 1:
1078     if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1079       cmp = frag2->xMax - frag1->xMax;
1080     }
1081     break;
1082   case 2:
1083     if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1084       cmp = frag2->yMin - frag1->yMin;
1085     }
1086     break;
1087   case 3:
1088     if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1089       cmp = frag1->xMax - frag2->xMax;
1090     }
1091     break;
1092   }
1093   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1094 }
1095 
cmpXYColumnPrimaryRot(const void * p1,const void * p2)1096 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1097   TextLineFrag *frag1 = (TextLineFrag *)p1;
1098   TextLineFrag *frag2 = (TextLineFrag *)p2;
1099   double cmp;
1100 
1101   // if columns overlap, compare y values
1102   if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1103 				 frag2->line->col[frag2->start]) &&
1104       frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1105 				 frag1->line->col[frag1->start])) {
1106     cmp = 0; // make gcc happy
1107     switch (frag1->line->blk->page->primaryRot) {
1108     case 0: cmp = frag1->yMin - frag2->yMin; break;
1109     case 1: cmp = frag2->xMax - frag1->xMax; break;
1110     case 2: cmp = frag2->yMin - frag1->yMin; break;
1111     case 3: cmp = frag1->xMax - frag2->xMax; break;
1112     }
1113     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1114   }
1115 
1116   // otherwise, compare starting column
1117   return frag1->col - frag2->col;
1118 }
1119 
cmpXYColumnLineRot(const void * p1,const void * p2)1120 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1121   TextLineFrag *frag1 = (TextLineFrag *)p1;
1122   TextLineFrag *frag2 = (TextLineFrag *)p2;
1123   double cmp;
1124 
1125   // if columns overlap, compare y values
1126   if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1127 				 frag2->line->col[frag2->start]) &&
1128       frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1129 				 frag1->line->col[frag1->start])) {
1130     cmp = 0; // make gcc happy
1131     switch (frag1->line->rot) {
1132     case 0: cmp = frag1->yMin - frag2->yMin; break;
1133     case 1: cmp = frag2->xMax - frag1->xMax; break;
1134     case 2: cmp = frag2->yMin - frag1->yMin; break;
1135     case 3: cmp = frag1->xMax - frag2->xMax; break;
1136     }
1137     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1138   }
1139 
1140   // otherwise, compare starting column
1141   return frag1->col - frag2->col;
1142 }
1143 
1144 //------------------------------------------------------------------------
1145 // TextBlock
1146 //------------------------------------------------------------------------
1147 
TextBlock(TextPage * pageA,int rotA)1148 TextBlock::TextBlock(TextPage *pageA, int rotA) {
1149   page = pageA;
1150   rot = rotA;
1151   xMin = yMin = 0;
1152   xMax = yMax = -1;
1153   priMin = 0;
1154   priMax = page->pageWidth;
1155   pool = new TextPool();
1156   lines = NULL;
1157   curLine = NULL;
1158   next = NULL;
1159   stackNext = NULL;
1160   tableId = -1;
1161   tableEnd = gFalse;
1162 }
1163 
~TextBlock()1164 TextBlock::~TextBlock() {
1165   TextLine *line;
1166 
1167   delete pool;
1168   while (lines) {
1169     line = lines;
1170     lines = lines->next;
1171     delete line;
1172   }
1173 }
1174 
addWord(TextWord * word)1175 void TextBlock::addWord(TextWord *word) {
1176   pool->addWord(word);
1177   if (xMin > xMax) {
1178     xMin = word->xMin;
1179     xMax = word->xMax;
1180     yMin = word->yMin;
1181     yMax = word->yMax;
1182   } else {
1183     if (word->xMin < xMin) {
1184       xMin = word->xMin;
1185     }
1186     if (word->xMax > xMax) {
1187       xMax = word->xMax;
1188     }
1189     if (word->yMin < yMin) {
1190       yMin = word->yMin;
1191     }
1192     if (word->yMax > yMax) {
1193       yMax = word->yMax;
1194     }
1195   }
1196 }
1197 
coalesce(UnicodeMap * uMap)1198 void TextBlock::coalesce(UnicodeMap *uMap) {
1199   TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1200   TextLine *line, *line0, *line1;
1201   int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1202   int baseIdx, bestWordBaseIdx, idx0, idx1;
1203   double minBase, maxBase;
1204   double fontSize, delta, priDelta, secDelta;
1205   TextLine **lineArray;
1206   GBool found;
1207   int col1, col2;
1208   int i, j, k;
1209 
1210   // discard duplicated text (fake boldface, drop shadows)
1211   for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1212     word0 = pool->getPool(idx0);
1213     while (word0) {
1214       priDelta = dupMaxPriDelta * word0->fontSize;
1215       secDelta = dupMaxSecDelta * word0->fontSize;
1216       if (rot == 0 || rot == 3) {
1217 	maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1218       } else {
1219 	maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1220       }
1221       found = gFalse;
1222       word1 = word2 = NULL; // make gcc happy
1223       for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1224 	if (idx1 == idx0) {
1225 	  word1 = word0;
1226 	  word2 = word0->next;
1227 	} else {
1228 	  word1 = NULL;
1229 	  word2 = pool->getPool(idx1);
1230 	}
1231 	for (; word2; word1 = word2, word2 = word2->next) {
1232 	  if (word2->len == word0->len &&
1233 	      !memcmp(word2->text, word0->text,
1234 		      word0->len * sizeof(Unicode))) {
1235 	    switch (rot) {
1236 	    case 0:
1237 	    case 2:
1238 	      found = fabs(word0->xMin - word2->xMin) < priDelta &&
1239 		      fabs(word0->xMax - word2->xMax) < priDelta &&
1240 		      fabs(word0->yMin - word2->yMin) < secDelta &&
1241 		      fabs(word0->yMax - word2->yMax) < secDelta;
1242 	      break;
1243 	    case 1:
1244 	    case 3:
1245 	      found = fabs(word0->xMin - word2->xMin) < secDelta &&
1246 		      fabs(word0->xMax - word2->xMax) < secDelta &&
1247 		      fabs(word0->yMin - word2->yMin) < priDelta &&
1248 		      fabs(word0->yMax - word2->yMax) < priDelta;
1249 	      break;
1250 	    }
1251 	  }
1252 	  if (found) {
1253 	    break;
1254 	  }
1255 	}
1256 	if (found) {
1257 	  break;
1258 	}
1259       }
1260       if (found) {
1261 	if (word1) {
1262 	  word1->next = word2->next;
1263 	} else {
1264 	  pool->setPool(idx1, word2->next);
1265 	}
1266 	delete word2;
1267       } else {
1268 	word0 = word0->next;
1269       }
1270     }
1271   }
1272 
1273   // build the lines
1274   curLine = NULL;
1275   poolMinBaseIdx = pool->minBaseIdx;
1276   charCount = 0;
1277   nLines = 0;
1278   while (1) {
1279 
1280     // find the first non-empty line in the pool
1281     for (;
1282 	 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1283 	 ++poolMinBaseIdx) ;
1284     if (poolMinBaseIdx > pool->maxBaseIdx) {
1285       break;
1286     }
1287 
1288     // look for the left-most word in the first four lines of the
1289     // pool -- this avoids starting with a superscript word
1290     startBaseIdx = poolMinBaseIdx;
1291     for (baseIdx = poolMinBaseIdx + 1;
1292 	 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1293 	 ++baseIdx) {
1294       if (!pool->getPool(baseIdx)) {
1295 	continue;
1296       }
1297       if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1298 	  < 0) {
1299 	startBaseIdx = baseIdx;
1300       }
1301     }
1302 
1303     // create a new line
1304     word0 = pool->getPool(startBaseIdx);
1305     pool->setPool(startBaseIdx, word0->next);
1306     word0->next = NULL;
1307     line = new TextLine(this, word0->rot, word0->base);
1308     line->addWord(word0);
1309     lastWord = word0;
1310 
1311     // compute the search range
1312     fontSize = word0->fontSize;
1313     minBase = word0->base - maxIntraLineDelta * fontSize;
1314     maxBase = word0->base + maxIntraLineDelta * fontSize;
1315     minBaseIdx = pool->getBaseIdx(minBase);
1316     maxBaseIdx = pool->getBaseIdx(maxBase);
1317 
1318     // find the rest of the words in this line
1319     while (1) {
1320 
1321       // find the left-most word whose baseline is in the range for
1322       // this line
1323       bestWordBaseIdx = 0;
1324       bestWord0 = bestWord1 = NULL;
1325       for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1326 	for (word0 = NULL, word1 = pool->getPool(baseIdx);
1327 	     word1;
1328 	     word0 = word1, word1 = word1->next) {
1329 	  if (word1->base >= minBase &&
1330 	      word1->base <= maxBase &&
1331 	      (delta = lastWord->primaryDelta(word1)) >=
1332 	        minCharSpacing * fontSize) {
1333 	    if (delta < maxWordSpacing * fontSize &&
1334 		(!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1335 	      bestWordBaseIdx = baseIdx;
1336 	      bestWord0 = word0;
1337 	      bestWord1 = word1;
1338 	    }
1339 	    break;
1340 	  }
1341 	}
1342       }
1343       if (!bestWord1) {
1344 	break;
1345       }
1346 
1347       // remove it from the pool, and add it to the line
1348       if (bestWord0) {
1349 	bestWord0->next = bestWord1->next;
1350       } else {
1351 	pool->setPool(bestWordBaseIdx, bestWord1->next);
1352       }
1353       bestWord1->next = NULL;
1354       line->addWord(bestWord1);
1355       lastWord = bestWord1;
1356     }
1357 
1358     // add the line
1359     if (curLine && line->cmpYX(curLine) > 0) {
1360       line0 = curLine;
1361       line1 = curLine->next;
1362     } else {
1363       line0 = NULL;
1364       line1 = lines;
1365     }
1366     for (;
1367 	 line1 && line->cmpYX(line1) > 0;
1368 	 line0 = line1, line1 = line1->next) ;
1369     if (line0) {
1370       line0->next = line;
1371     } else {
1372       lines = line;
1373     }
1374     line->next = line1;
1375     curLine = line;
1376     line->coalesce(uMap);
1377     charCount += line->len;
1378     ++nLines;
1379   }
1380 
1381   // sort lines into xy order for column assignment
1382   lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1383   for (line = lines, i = 0; line; line = line->next, ++i) {
1384     lineArray[i] = line;
1385   }
1386   qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1387 
1388   // column assignment
1389   nColumns = 0;
1390   for (i = 0; i < nLines; ++i) {
1391     line0 = lineArray[i];
1392     col1 = 0;
1393     for (j = 0; j < i; ++j) {
1394       line1 = lineArray[j];
1395       if (line1->primaryDelta(line0) >= 0) {
1396 	col2 = line1->col[line1->len] + 1;
1397       } else {
1398 	k = 0; // make gcc happy
1399 	switch (rot) {
1400 	case 0:
1401 	  for (k = 0;
1402 	       k < line1->len &&
1403 		 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1404 	       ++k) ;
1405 	  break;
1406 	case 1:
1407 	  for (k = 0;
1408 	       k < line1->len &&
1409 		 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1410 	       ++k) ;
1411 	  break;
1412 	case 2:
1413 	  for (k = 0;
1414 	       k < line1->len &&
1415 		 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1416 	       ++k) ;
1417 	  break;
1418 	case 3:
1419 	  for (k = 0;
1420 	       k < line1->len &&
1421 		 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1422 	       ++k) ;
1423 	  break;
1424 	}
1425 	col2 = line1->col[k];
1426       }
1427       if (col2 > col1) {
1428 	col1 = col2;
1429       }
1430     }
1431     for (k = 0; k <= line0->len; ++k) {
1432       line0->col[k] += col1;
1433     }
1434     if (line0->col[line0->len] > nColumns) {
1435       nColumns = line0->col[line0->len];
1436     }
1437   }
1438   gfree(lineArray);
1439 }
1440 
updatePriMinMax(TextBlock * blk)1441 void TextBlock::updatePriMinMax(TextBlock *blk) {
1442   double newPriMin, newPriMax;
1443   GBool gotPriMin, gotPriMax;
1444 
1445   gotPriMin = gotPriMax = gFalse;
1446   newPriMin = newPriMax = 0; // make gcc happy
1447   switch (page->primaryRot) {
1448   case 0:
1449   case 2:
1450     if (blk->yMin < yMax && blk->yMax > yMin) {
1451       if (blk->xMin < xMin) {
1452 	newPriMin = blk->xMax;
1453 	gotPriMin = gTrue;
1454       }
1455       if (blk->xMax > xMax) {
1456 	newPriMax = blk->xMin;
1457 	gotPriMax = gTrue;
1458       }
1459     }
1460     break;
1461   case 1:
1462   case 3:
1463     if (blk->xMin < xMax && blk->xMax > xMin) {
1464       if (blk->yMin < yMin) {
1465 	newPriMin = blk->yMax;
1466 	gotPriMin = gTrue;
1467       }
1468       if (blk->yMax > yMax) {
1469 	newPriMax = blk->yMin;
1470 	gotPriMax = gTrue;
1471       }
1472     }
1473     break;
1474   }
1475   if (gotPriMin) {
1476     if (newPriMin > xMin) {
1477       newPriMin = xMin;
1478     }
1479     if (newPriMin > priMin) {
1480       priMin = newPriMin;
1481     }
1482   }
1483   if (gotPriMax) {
1484     if (newPriMax < xMax) {
1485       newPriMax = xMax;
1486     }
1487     if (newPriMax < priMax) {
1488       priMax = newPriMax;
1489     }
1490   }
1491 }
1492 
cmpXYPrimaryRot(const void * p1,const void * p2)1493 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1494   TextBlock *blk1 = *(TextBlock **)p1;
1495   TextBlock *blk2 = *(TextBlock **)p2;
1496   double cmp;
1497 
1498   cmp = 0; // make gcc happy
1499   switch (blk1->page->primaryRot) {
1500   case 0:
1501     if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1502       cmp = blk1->yMin - blk2->yMin;
1503     }
1504     break;
1505   case 1:
1506     if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1507       cmp = blk2->xMax - blk1->xMax;
1508     }
1509     break;
1510   case 2:
1511     if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1512       cmp = blk2->yMin - blk1->yMin;
1513     }
1514     break;
1515   case 3:
1516     if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1517       cmp = blk1->xMax - blk2->xMax;
1518     }
1519     break;
1520   }
1521   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1522 }
1523 
cmpYXPrimaryRot(const void * p1,const void * p2)1524 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1525   TextBlock *blk1 = *(TextBlock **)p1;
1526   TextBlock *blk2 = *(TextBlock **)p2;
1527   double cmp;
1528 
1529   cmp = 0; // make gcc happy
1530   switch (blk1->page->primaryRot) {
1531   case 0:
1532     if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1533       cmp = blk1->xMin - blk2->xMin;
1534     }
1535     break;
1536   case 1:
1537     if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1538       cmp = blk1->yMin - blk2->yMin;
1539     }
1540     break;
1541   case 2:
1542     if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1543       cmp = blk2->xMax - blk1->xMax;
1544     }
1545     break;
1546   case 3:
1547     if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1548       cmp = blk2->yMax - blk1->yMax;
1549     }
1550     break;
1551   }
1552   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1553 }
1554 
primaryCmp(TextBlock * blk)1555 int TextBlock::primaryCmp(TextBlock *blk) {
1556   double cmp;
1557 
1558   cmp = 0; // make gcc happy
1559   switch (rot) {
1560   case 0:
1561     cmp = xMin - blk->xMin;
1562     break;
1563   case 1:
1564     cmp = yMin - blk->yMin;
1565     break;
1566   case 2:
1567     cmp = blk->xMax - xMax;
1568     break;
1569   case 3:
1570     cmp = blk->yMax - yMax;
1571     break;
1572   }
1573   return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1574 }
1575 
secondaryDelta(TextBlock * blk)1576 double TextBlock::secondaryDelta(TextBlock *blk) {
1577   double delta;
1578 
1579   delta = 0; // make gcc happy
1580   switch (rot) {
1581   case 0:
1582     delta = blk->yMin - yMax;
1583     break;
1584   case 1:
1585     delta = xMin - blk->xMax;
1586     break;
1587   case 2:
1588     delta = yMin - blk->yMax;
1589     break;
1590   case 3:
1591     delta = blk->xMin - xMax;
1592     break;
1593   }
1594   return delta;
1595 }
1596 
isBelow(TextBlock * blk)1597 GBool TextBlock::isBelow(TextBlock *blk) {
1598   GBool below;
1599 
1600   below = gFalse; // make gcc happy
1601   switch (page->primaryRot) {
1602   case 0:
1603     below = xMin >= blk->priMin && xMax <= blk->priMax &&
1604             yMin > blk->yMin;
1605     break;
1606   case 1:
1607     below = yMin >= blk->priMin && yMax <= blk->priMax &&
1608             xMax < blk->xMax;
1609     break;
1610   case 2:
1611     below = xMin >= blk->priMin && xMax <= blk->priMax &&
1612             yMax < blk->yMax;
1613     break;
1614   case 3:
1615     below = yMin >= blk->priMin && yMax <= blk->priMax &&
1616             xMin > blk->xMin;
1617     break;
1618   }
1619 
1620   return below;
1621 }
1622 
isBeforeByRule1(TextBlock * blk1)1623 GBool TextBlock::isBeforeByRule1(TextBlock *blk1) {
1624   GBool before = gFalse;
1625   GBool overlap = gFalse;
1626 
1627   switch (this->page->primaryRot) {
1628   case 0:
1629   case 2:
1630     overlap = ((this->ExMin <= blk1->ExMin) &&
1631 	       (blk1->ExMin <= this->ExMax)) ||
1632       ((blk1->ExMin <= this->ExMin) &&
1633        (this->ExMin <= blk1->ExMax));
1634     break;
1635   case 1:
1636   case 3:
1637     overlap = ((this->EyMin <= blk1->EyMin) &&
1638 	       (blk1->EyMin <= this->EyMax)) ||
1639       ((blk1->EyMin <= this->EyMin) &&
1640        (this->EyMin <= blk1->EyMax));
1641     break;
1642   }
1643   switch (this->page->primaryRot) {
1644   case 0:
1645     before = overlap && this->EyMin < blk1->EyMin;
1646     break;
1647   case 1:
1648     before = overlap && this->ExMax > blk1->ExMax;
1649     break;
1650   case 2:
1651     before = overlap && this->EyMax > blk1->EyMax;
1652     break;
1653   case 3:
1654     before = overlap && this->ExMin < blk1->ExMin;
1655     break;
1656   }
1657   return before;
1658 }
1659 
isBeforeByRule2(TextBlock * blk1)1660 GBool TextBlock::isBeforeByRule2(TextBlock *blk1) {
1661   double cmp = 0;
1662   int rotLR = rot;
1663 
1664   if (!page->primaryLR) {
1665     rotLR = (rotLR + 2) % 4;
1666   }
1667 
1668   switch (rotLR) {
1669   case 0:
1670     cmp = ExMax - blk1->ExMin;
1671     break;
1672   case 1:
1673     cmp = EyMin - blk1->EyMax;
1674     break;
1675   case 2:
1676     cmp = blk1->ExMax - ExMin;
1677     break;
1678   case 3:
1679     cmp = blk1->EyMin - EyMax;
1680     break;
1681   }
1682   return cmp <= 0;
1683 }
1684 
1685 // Sort into reading order by performing a topological sort using the rules
1686 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
1687 // See http://pubs.iupr.org/#2003-breuel-sdiut
1688 // Topological sort is done by depth first search, see
1689 // http://en.wikipedia.org/wiki/Topological_sorting
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,GBool * visited)1690 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1,
1691 			       TextBlock **sorted, int sortPos,
1692 			       GBool* visited) {
1693   int pos2;
1694   TextBlock *blk1, *blk2, *blk3;
1695   GBool before;
1696 
1697   if (visited[pos1]) {
1698     return sortPos;
1699   }
1700 
1701   blk1 = this;
1702 
1703 #if 0 // for debugging
1704   printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
1705 	 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1706 #endif
1707   visited[pos1] = gTrue;
1708   pos2 = -1;
1709   for (blk2 = blkList; blk2; blk2 = blk2->next) {
1710     pos2++;
1711     if (visited[pos2]) {
1712       // skip visited nodes
1713       continue;
1714     }
1715     before = gFalse;
1716 
1717     // is blk2 before blk1? (for table entries)
1718     if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
1719       if (page->primaryLR) {
1720         if (blk2->xMax <= blk1->xMin &&
1721             blk2->yMin <= blk1->yMax &&
1722             blk2->yMax >= blk1->yMin)
1723           before = gTrue;
1724       } else {
1725         if (blk2->xMin >= blk1->xMax &&
1726             blk2->yMin <= blk1->yMax &&
1727             blk2->yMax >= blk1->yMin)
1728           before = gTrue;
1729       }
1730 
1731       if (blk2->yMax <= blk1->yMin)
1732         before = gTrue;
1733     } else {
1734       if (blk2->isBeforeByRule1(blk1)) {
1735         // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
1736         before = gTrue;
1737 #if 0   // for debugging
1738         printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
1739 	       blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
1740 	       blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1741 #endif
1742       } else if (blk2->isBeforeByRule2(blk1)) {
1743         // Rule (2) blk2 left of blk1, and no intervening blk3
1744         //          such that blk1 is before blk3 by rule 1,
1745         //          and blk3 is before blk2 by rule 1.
1746         before = gTrue;
1747         for (blk3 = blkList; blk3; blk3 = blk3->next) {
1748 	  if (blk3 == blk2 || blk3 == blk1) {
1749 	    continue;
1750 	  }
1751 	  if (blk1->isBeforeByRule1(blk3) &&
1752 	      blk3->isBeforeByRule1(blk2)) {
1753 	    before = gFalse;
1754 	    break;
1755 	  }
1756         }
1757 #if 0 // for debugging
1758         if (before) {
1759 	  printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
1760 	         blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
1761 	         blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
1762         }
1763 #endif
1764       }
1765     }
1766     if (before) {
1767       // blk2 is before blk1, so it needs to be visited
1768       // before we can add blk1 to the sorted list.
1769       sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited);
1770     }
1771   }
1772 #if 0 // for debugging
1773   printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
1774 	 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1775 #endif
1776   sorted[sortPos++] = blk1;
1777   return sortPos;
1778 }
1779 
1780 //------------------------------------------------------------------------
1781 // TextFlow
1782 //------------------------------------------------------------------------
1783 
TextFlow(TextPage * pageA,TextBlock * blk)1784 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1785   page = pageA;
1786   xMin = blk->xMin;
1787   xMax = blk->xMax;
1788   yMin = blk->yMin;
1789   yMax = blk->yMax;
1790   priMin = blk->priMin;
1791   priMax = blk->priMax;
1792   blocks = lastBlk = blk;
1793   next = NULL;
1794 }
1795 
~TextFlow()1796 TextFlow::~TextFlow() {
1797   TextBlock *blk;
1798 
1799   while (blocks) {
1800     blk = blocks;
1801     blocks = blocks->next;
1802     delete blk;
1803   }
1804 }
1805 
addBlock(TextBlock * blk)1806 void TextFlow::addBlock(TextBlock *blk) {
1807   if (lastBlk) {
1808     lastBlk->next = blk;
1809   } else {
1810     blocks = blk;
1811   }
1812   lastBlk = blk;
1813   if (blk->xMin < xMin) {
1814     xMin = blk->xMin;
1815   }
1816   if (blk->xMax > xMax) {
1817     xMax = blk->xMax;
1818   }
1819   if (blk->yMin < yMin) {
1820     yMin = blk->yMin;
1821   }
1822   if (blk->yMax > yMax) {
1823     yMax = blk->yMax;
1824   }
1825 }
1826 
blockFits(TextBlock * blk,TextBlock * prevBlk)1827 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1828   GBool fits;
1829 
1830   // lower blocks must use smaller fonts
1831   if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1832     return gFalse;
1833   }
1834 
1835   fits = gFalse; // make gcc happy
1836   switch (page->primaryRot) {
1837   case 0:
1838     fits = blk->xMin >= priMin && blk->xMax <= priMax;
1839     break;
1840   case 1:
1841     fits = blk->yMin >= priMin && blk->yMax <= priMax;
1842     break;
1843   case 2:
1844     fits = blk->xMin >= priMin && blk->xMax <= priMax;
1845     break;
1846   case 3:
1847     fits = blk->yMin >= priMin && blk->yMax <= priMax;
1848     break;
1849   }
1850   return fits;
1851 }
1852 
1853 #if TEXTOUT_WORD_LIST
1854 
1855 //------------------------------------------------------------------------
1856 // TextWordList
1857 //------------------------------------------------------------------------
1858 
TextWordList(TextPage * text,GBool physLayout)1859 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1860   TextFlow *flow;
1861   TextBlock *blk;
1862   TextLine *line;
1863   TextWord *word;
1864   TextWord **wordArray;
1865   int nWords, i;
1866 
1867   words = new GooList();
1868 
1869   if (text->rawOrder) {
1870     for (word = text->rawWords; word; word = word->next) {
1871       words->append(word);
1872     }
1873 
1874   } else if (physLayout) {
1875     // this is inefficient, but it's also the least useful of these
1876     // three cases
1877     nWords = 0;
1878     for (flow = text->flows; flow; flow = flow->next) {
1879       for (blk = flow->blocks; blk; blk = blk->next) {
1880 	for (line = blk->lines; line; line = line->next) {
1881 	  for (word = line->words; word; word = word->next) {
1882 	    ++nWords;
1883 	  }
1884 	}
1885       }
1886     }
1887     wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1888     i = 0;
1889     for (flow = text->flows; flow; flow = flow->next) {
1890       for (blk = flow->blocks; blk; blk = blk->next) {
1891 	for (line = blk->lines; line; line = line->next) {
1892 	  for (word = line->words; word; word = word->next) {
1893 	    wordArray[i++] = word;
1894 	  }
1895 	}
1896       }
1897     }
1898     qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1899     for (i = 0; i < nWords; ++i) {
1900       words->append(wordArray[i]);
1901     }
1902     gfree(wordArray);
1903 
1904   } else {
1905     for (flow = text->flows; flow; flow = flow->next) {
1906       for (blk = flow->blocks; blk; blk = blk->next) {
1907 	for (line = blk->lines; line; line = line->next) {
1908 	  for (word = line->words; word; word = word->next) {
1909 	    words->append(word);
1910 	  }
1911 	}
1912       }
1913     }
1914   }
1915 }
1916 
~TextWordList()1917 TextWordList::~TextWordList() {
1918   delete words;
1919 }
1920 
getLength()1921 int TextWordList::getLength() {
1922   return words->getLength();
1923 }
1924 
get(int idx)1925 TextWord *TextWordList::get(int idx) {
1926   if (idx < 0 || idx >= words->getLength()) {
1927     return NULL;
1928   }
1929   return (TextWord *)words->get(idx);
1930 }
1931 
1932 #endif // TEXTOUT_WORD_LIST
1933 
1934 //------------------------------------------------------------------------
1935 // TextPage
1936 //------------------------------------------------------------------------
1937 
TextPage(GBool rawOrderA)1938 TextPage::TextPage(GBool rawOrderA) {
1939   int rot;
1940 
1941   refCnt = 1;
1942   rawOrder = rawOrderA;
1943   curWord = NULL;
1944   charPos = 0;
1945   curFont = NULL;
1946   curFontSize = 0;
1947   nest = 0;
1948   nTinyChars = 0;
1949   lastCharOverlap = gFalse;
1950   if (!rawOrder) {
1951     for (rot = 0; rot < 4; ++rot) {
1952       pools[rot] = new TextPool();
1953     }
1954   }
1955   flows = NULL;
1956   blocks = NULL;
1957   rawWords = NULL;
1958   rawLastWord = NULL;
1959   fonts = new GooList();
1960   lastFindXMin = lastFindYMin = 0;
1961   haveLastFind = gFalse;
1962   underlines = new GooList();
1963   links = new GooList();
1964 }
1965 
~TextPage()1966 TextPage::~TextPage() {
1967   int rot;
1968 
1969   clear();
1970   if (!rawOrder) {
1971     for (rot = 0; rot < 4; ++rot) {
1972       delete pools[rot];
1973     }
1974   }
1975   delete fonts;
1976   deleteGooList(underlines, TextUnderline);
1977   deleteGooList(links, TextLink);
1978 }
1979 
incRefCnt()1980 void TextPage::incRefCnt() {
1981   refCnt++;
1982 }
1983 
decRefCnt()1984 void TextPage::decRefCnt() {
1985   if (--refCnt == 0)
1986     delete this;
1987 }
1988 
startPage(GfxState * state)1989 void TextPage::startPage(GfxState *state) {
1990   clear();
1991   if (state) {
1992     pageWidth = state->getPageWidth();
1993     pageHeight = state->getPageHeight();
1994   } else {
1995     pageWidth = pageHeight = 0;
1996   }
1997 }
1998 
endPage()1999 void TextPage::endPage() {
2000   if (curWord) {
2001     endWord();
2002   }
2003 }
2004 
clear()2005 void TextPage::clear() {
2006   int rot;
2007   TextFlow *flow;
2008   TextWord *word;
2009 
2010   if (curWord) {
2011     delete curWord;
2012     curWord = NULL;
2013   }
2014   if (rawOrder) {
2015     while (rawWords) {
2016       word = rawWords;
2017       rawWords = rawWords->next;
2018       delete word;
2019     }
2020   } else {
2021     for (rot = 0; rot < 4; ++rot) {
2022       delete pools[rot];
2023     }
2024     while (flows) {
2025       flow = flows;
2026       flows = flows->next;
2027       delete flow;
2028     }
2029     gfree(blocks);
2030   }
2031   deleteGooList(fonts, TextFontInfo);
2032 
2033   curWord = NULL;
2034   charPos = 0;
2035   curFont = NULL;
2036   curFontSize = 0;
2037   nest = 0;
2038   nTinyChars = 0;
2039   if (!rawOrder) {
2040     for (rot = 0; rot < 4; ++rot) {
2041       pools[rot] = new TextPool();
2042     }
2043   }
2044   flows = NULL;
2045   blocks = NULL;
2046   rawWords = NULL;
2047   rawLastWord = NULL;
2048   fonts = new GooList();
2049 }
2050 
updateFont(GfxState * state)2051 void TextPage::updateFont(GfxState *state) {
2052   GfxFont *gfxFont;
2053   double *fm;
2054   char *name;
2055   int code, mCode, letterCode, anyCode;
2056   double w;
2057   int i;
2058 
2059   // get the font info object
2060   curFont = NULL;
2061   for (i = 0; i < fonts->getLength(); ++i) {
2062     curFont = (TextFontInfo *)fonts->get(i);
2063     if (curFont->matches(state)) {
2064       break;
2065     }
2066     curFont = NULL;
2067   }
2068   if (!curFont) {
2069     curFont = new TextFontInfo(state);
2070     fonts->append(curFont);
2071   }
2072 
2073   // adjust the font size
2074   gfxFont = state->getFont();
2075   curFontSize = state->getTransformedFontSize();
2076   if (gfxFont && gfxFont->getType() == fontType3) {
2077     // This is a hack which makes it possible to deal with some Type 3
2078     // fonts.  The problem is that it's impossible to know what the
2079     // base coordinate system used in the font is without actually
2080     // rendering the font.  This code tries to guess by looking at the
2081     // width of the character 'm' (which breaks if the font is a
2082     // subset that doesn't contain 'm').
2083     mCode = letterCode = anyCode = -1;
2084     for (code = 0; code < 256; ++code) {
2085       name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2086       if (name && name[0] == 'm' && name[1] == '\0') {
2087 	mCode = code;
2088       }
2089       if (letterCode < 0 && name && name[1] == '\0' &&
2090 	  ((name[0] >= 'A' && name[0] <= 'Z') ||
2091 	   (name[0] >= 'a' && name[0] <= 'z'))) {
2092 	letterCode = code;
2093       }
2094       if (anyCode < 0 && name &&
2095 	  ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2096 	anyCode = code;
2097       }
2098     }
2099     if (mCode >= 0 &&
2100 	(w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2101       // 0.6 is a generic average 'm' width -- yes, this is a hack
2102       curFontSize *= w / 0.6;
2103     } else if (letterCode >= 0 &&
2104 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2105       // even more of a hack: 0.5 is a generic letter width
2106       curFontSize *= w / 0.5;
2107     } else if (anyCode >= 0 &&
2108 	       (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2109       // better than nothing: 0.5 is a generic character width
2110       curFontSize *= w / 0.5;
2111     }
2112     fm = gfxFont->getFontMatrix();
2113     if (fm[0] != 0) {
2114       curFontSize *= fabs(fm[3] / fm[0]);
2115     }
2116   }
2117 }
2118 
beginWord(GfxState * state,double x0,double y0)2119 void TextPage::beginWord(GfxState *state, double x0, double y0) {
2120   GfxFont *gfxFont;
2121   double *fontm;
2122   double m[4], m2[4];
2123   int rot;
2124 
2125   // This check is needed because Type 3 characters can contain
2126   // text-drawing operations (when TextPage is being used via
2127   // {X,Win}SplashOutputDev rather than TextOutputDev).
2128   if (curWord) {
2129     ++nest;
2130     return;
2131   }
2132 
2133   // compute the rotation
2134   state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2135   gfxFont = state->getFont();
2136   if (gfxFont && gfxFont->getType() == fontType3) {
2137     fontm = state->getFont()->getFontMatrix();
2138     m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2139     m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2140     m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2141     m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2142     m[0] = m2[0];
2143     m[1] = m2[1];
2144     m[2] = m2[2];
2145     m[3] = m2[3];
2146   }
2147   if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2148     rot = (m[3] < 0) ? 0 : 2;
2149   } else {
2150     rot = (m[2] > 0) ? 1 : 3;
2151   }
2152 
2153   curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
2154 }
2155 
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)2156 void TextPage::addChar(GfxState *state, double x, double y,
2157 		       double dx, double dy,
2158 		       CharCode c, int nBytes, Unicode *u, int uLen) {
2159   double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2160   GBool overlap;
2161   int i;
2162 
2163   // subtract char and word spacing from the dx,dy values
2164   sp = state->getCharSpace();
2165   if (c == (CharCode)0x20) {
2166     sp += state->getWordSpace();
2167   }
2168   state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2169   dx -= dx2;
2170   dy -= dy2;
2171   state->transformDelta(dx, dy, &w1, &h1);
2172 
2173   // throw away chars that aren't inside the page bounds
2174   // (and also do a sanity check on the character size)
2175   state->transform(x, y, &x1, &y1);
2176   if (x1 + w1 < 0 || x1 > pageWidth ||
2177       y1 + h1 < 0 || y1 > pageHeight ||
2178       w1 > pageWidth || h1 > pageHeight) {
2179     charPos += nBytes;
2180     return;
2181   }
2182 
2183   // check the tiny chars limit
2184   if (!globalParams->getTextKeepTinyChars() &&
2185       fabs(w1) < 3 && fabs(h1) < 3) {
2186     if (++nTinyChars > 50000) {
2187       charPos += nBytes;
2188       return;
2189     }
2190   }
2191 
2192   // break words at space character
2193   if (uLen == 1 && u[0] == (Unicode)0x20) {
2194     if (curWord) {
2195       ++curWord->charLen;
2196     }
2197     charPos += nBytes;
2198     endWord();
2199     return;
2200   }
2201 
2202   // start a new word if:
2203   // (1) this character doesn't fall in the right place relative to
2204   //     the end of the previous word (this places upper and lower
2205   //     constraints on the position deltas along both the primary
2206   //     and secondary axes), or
2207   // (2) this character overlaps the previous one (duplicated text), or
2208   // (3) the previous character was an overlap (we want each duplicated
2209   //     character to be in a word by itself at this stage),
2210   // (4) the font size has changed
2211   if (curWord && curWord->len > 0) {
2212     base = sp = delta = 0; // make gcc happy
2213     switch (curWord->rot) {
2214     case 0:
2215       base = y1;
2216       sp = x1 - curWord->xMax;
2217       delta = x1 - curWord->edge[curWord->len - 1];
2218       break;
2219     case 1:
2220       base = x1;
2221       sp = y1 - curWord->yMax;
2222       delta = y1 - curWord->edge[curWord->len - 1];
2223       break;
2224     case 2:
2225       base = y1;
2226       sp = curWord->xMin - x1;
2227       delta = curWord->edge[curWord->len - 1] - x1;
2228       break;
2229     case 3:
2230       base = x1;
2231       sp = curWord->yMin - y1;
2232       delta = curWord->edge[curWord->len - 1] - y1;
2233       break;
2234     }
2235     overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2236               fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2237     if (overlap || lastCharOverlap ||
2238 	sp < -minDupBreakOverlap * curWord->fontSize ||
2239 	sp > minWordBreakSpace * curWord->fontSize ||
2240 	fabs(base - curWord->base) > 0.5 ||
2241 	curFontSize != curWord->fontSize) {
2242       endWord();
2243     }
2244     lastCharOverlap = overlap;
2245   } else {
2246     lastCharOverlap = gFalse;
2247   }
2248 
2249   if (uLen != 0) {
2250     // start a new word if needed
2251     if (!curWord) {
2252       beginWord(state, x, y);
2253     }
2254 
2255     // page rotation and/or transform matrices can cause text to be
2256     // drawn in reverse order -- in this case, swap the begin/end
2257     // coordinates and break text into individual chars
2258     if ((curWord->rot == 0 && w1 < 0) ||
2259         (curWord->rot == 1 && h1 < 0) ||
2260         (curWord->rot == 2 && w1 > 0) ||
2261         (curWord->rot == 3 && h1 > 0)) {
2262       endWord();
2263       beginWord(state, x + dx, y + dy);
2264       x1 += w1;
2265       y1 += h1;
2266       w1 = -w1;
2267       h1 = -h1;
2268     }
2269 
2270     // add the characters to the current word
2271     w1 /= uLen;
2272     h1 /= uLen;
2273     for (i = 0; i < uLen; ++i) {
2274       if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
2275 	if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
2276 	  /* next code is a low surrogate */
2277 	  Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
2278 	  i++;
2279 	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu);
2280 	} else {
2281 	    /* missing low surrogate
2282 	     replace it with REPLACEMENT CHARACTER (U+FFFD) */
2283 	  curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
2284 	}
2285       } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
2286 	  /* invalid low surrogate
2287 	   replace it with REPLACEMENT CHARACTER (U+FFFD) */
2288 	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
2289       } else {
2290 	curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
2291       }
2292     }
2293   }
2294   if (curWord) {
2295     curWord->charLen += nBytes;
2296   }
2297   charPos += nBytes;
2298 }
2299 
endWord()2300 void TextPage::endWord() {
2301   // This check is needed because Type 3 characters can contain
2302   // text-drawing operations (when TextPage is being used via
2303   // {X,Win}SplashOutputDev rather than TextOutputDev).
2304   if (nest > 0) {
2305     --nest;
2306     return;
2307   }
2308 
2309   if (curWord) {
2310     addWord(curWord);
2311     curWord = NULL;
2312   }
2313 }
2314 
addWord(TextWord * word)2315 void TextPage::addWord(TextWord *word) {
2316   // throw away zero-length words -- they don't have valid xMin/xMax
2317   // values, and they're useless anyway
2318   if (word->len == 0) {
2319     delete word;
2320     return;
2321   }
2322 
2323   if (rawOrder) {
2324     if (rawLastWord) {
2325       rawLastWord->next = word;
2326     } else {
2327       rawWords = word;
2328     }
2329     rawLastWord = word;
2330   } else {
2331     pools[word->rot]->addWord(word);
2332   }
2333 }
2334 
addUnderline(double x0,double y0,double x1,double y1)2335 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2336   underlines->append(new TextUnderline(x0, y0, x1, y1));
2337 }
2338 
addLink(int xMin,int yMin,int xMax,int yMax,Link * link)2339 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
2340   links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2341 }
2342 
coalesce(GBool physLayout,GBool doHTML)2343 void TextPage::coalesce(GBool physLayout, GBool doHTML) {
2344   UnicodeMap *uMap;
2345   TextPool *pool;
2346   TextWord *word0, *word1, *word2;
2347   TextLine *line;
2348   TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2349   TextFlow *flow, *lastFlow;
2350   TextUnderline *underline;
2351   TextLink *link;
2352   int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2353   double minBase, maxBase, newMinBase, newMaxBase;
2354   double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2355   GBool found;
2356   int count[4];
2357   int lrCount;
2358   int col1, col2;
2359   int i, j, n;
2360 
2361   if (rawOrder) {
2362     primaryRot = 0;
2363     primaryLR = gTrue;
2364     return;
2365   }
2366 
2367   uMap = globalParams->getTextEncoding();
2368   blkList = NULL;
2369   lastBlk = NULL;
2370   nBlocks = 0;
2371   primaryRot = -1;
2372 
2373 #if 0 // for debugging
2374   printf("*** initial words ***\n");
2375   for (rot = 0; rot < 4; ++rot) {
2376     pool = pools[rot];
2377     for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2378       for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2379 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2380 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2381 	       word0->base, word0->fontSize, rot*90, word0->link);
2382 	for (i = 0; i < word0->len; ++i) {
2383 	  fputc(word0->text[i] & 0xff, stdout);
2384 	}
2385 	printf("'\n");
2386       }
2387     }
2388   }
2389   printf("\n");
2390 #endif
2391 
2392 #if 0 //~ for debugging
2393   for (i = 0; i < underlines->getLength(); ++i) {
2394     underline = (TextUnderline *)underlines->get(i);
2395     printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2396 	   underline->x0, underline->x1, underline->y0, underline->y1,
2397 	   underline->horiz);
2398   }
2399 #endif
2400 
2401   if (doHTML) {
2402 
2403     //----- handle underlining
2404     for (i = 0; i < underlines->getLength(); ++i) {
2405       underline = (TextUnderline *)underlines->get(i);
2406       if (underline->horiz) {
2407 	// rot = 0
2408 	if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2409 	  startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2410 	  endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2411 	  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2412 	    for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2413 	      //~ need to check the y value against the word baseline
2414 	      if (underline->x0 < word0->xMin + underlineSlack &&
2415 		  word0->xMax - underlineSlack < underline->x1) {
2416 		word0->underlined = gTrue;
2417 	      }
2418 	    }
2419 	  }
2420 	}
2421 
2422 	// rot = 2
2423 	if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2424 	  startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2425 	  endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2426 	  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2427 	    for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2428 	      if (underline->x0 < word0->xMin + underlineSlack &&
2429 		  word0->xMax - underlineSlack < underline->x1) {
2430 		word0->underlined = gTrue;
2431 	      }
2432 	    }
2433 	  }
2434 	}
2435       } else {
2436 	// rot = 1
2437 	if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2438 	  startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2439 	  endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2440 	  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2441 	    for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2442 	      if (underline->y0 < word0->yMin + underlineSlack &&
2443 		  word0->yMax - underlineSlack < underline->y1) {
2444 		word0->underlined = gTrue;
2445 	      }
2446 	    }
2447 	  }
2448 	}
2449 
2450 	// rot = 3
2451 	if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2452 	  startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2453 	  endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2454 	  for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2455 	    for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2456 	      if (underline->y0 < word0->yMin + underlineSlack &&
2457 		  word0->yMax - underlineSlack < underline->y1) {
2458 		word0->underlined = gTrue;
2459 	      }
2460 	    }
2461 	  }
2462 	}
2463       }
2464     }
2465 
2466     //----- handle links
2467     for (i = 0; i < links->getLength(); ++i) {
2468       link = (TextLink *)links->get(i);
2469 
2470       // rot = 0
2471       if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2472 	startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2473 	endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2474 	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2475 	  for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2476 	    if (link->xMin < word0->xMin + hyperlinkSlack &&
2477 		word0->xMax - hyperlinkSlack < link->xMax &&
2478 		link->yMin < word0->yMin + hyperlinkSlack &&
2479 		word0->yMax - hyperlinkSlack < link->yMax) {
2480 	      word0->link = link->link;
2481 	    }
2482 	  }
2483 	}
2484       }
2485 
2486       // rot = 2
2487       if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2488 	startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2489 	endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2490 	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2491 	  for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2492 	    if (link->xMin < word0->xMin + hyperlinkSlack &&
2493 		word0->xMax - hyperlinkSlack < link->xMax &&
2494 		link->yMin < word0->yMin + hyperlinkSlack &&
2495 		word0->yMax - hyperlinkSlack < link->yMax) {
2496 	      word0->link = link->link;
2497 	    }
2498 	  }
2499 	}
2500       }
2501 
2502       // rot = 1
2503       if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2504 	startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2505 	endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2506 	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2507 	  for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2508 	    if (link->yMin < word0->yMin + hyperlinkSlack &&
2509 		word0->yMax - hyperlinkSlack < link->yMax &&
2510 		link->xMin < word0->xMin + hyperlinkSlack &&
2511 		word0->xMax - hyperlinkSlack < link->xMax) {
2512 	      word0->link = link->link;
2513 	    }
2514 	  }
2515 	}
2516       }
2517 
2518       // rot = 3
2519       if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2520 	startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2521 	endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2522 	for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2523 	  for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2524 	    if (link->yMin < word0->yMin + hyperlinkSlack &&
2525 		word0->yMax - hyperlinkSlack < link->yMax &&
2526 		link->xMin < word0->xMin + hyperlinkSlack &&
2527 		word0->xMax - hyperlinkSlack < link->xMax) {
2528 	      word0->link = link->link;
2529 	    }
2530 	  }
2531 	}
2532       }
2533     }
2534   }
2535 
2536   //----- assemble the blocks
2537 
2538   //~ add an outer loop for writing mode (vertical text)
2539 
2540   // build blocks for each rotation value
2541   for (rot = 0; rot < 4; ++rot) {
2542     pool = pools[rot];
2543     poolMinBaseIdx = pool->minBaseIdx;
2544     count[rot] = 0;
2545 
2546     // add blocks until no more words are left
2547     while (1) {
2548 
2549       // find the first non-empty line in the pool
2550       for (;
2551 	   poolMinBaseIdx <= pool->maxBaseIdx &&
2552 	     !pool->getPool(poolMinBaseIdx);
2553 	   ++poolMinBaseIdx) ;
2554       if (poolMinBaseIdx > pool->maxBaseIdx) {
2555 	break;
2556       }
2557 
2558       // look for the left-most word in the first four lines of the
2559       // pool -- this avoids starting with a superscript word
2560       startBaseIdx = poolMinBaseIdx;
2561       for (baseIdx = poolMinBaseIdx + 1;
2562 	   baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2563 	   ++baseIdx) {
2564 	if (!pool->getPool(baseIdx)) {
2565 	  continue;
2566 	}
2567 	if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2568 	    < 0) {
2569 	  startBaseIdx = baseIdx;
2570 	}
2571       }
2572 
2573       // create a new block
2574       word0 = pool->getPool(startBaseIdx);
2575       pool->setPool(startBaseIdx, word0->next);
2576       word0->next = NULL;
2577       blk = new TextBlock(this, rot);
2578       blk->addWord(word0);
2579 
2580       fontSize = word0->fontSize;
2581       minBase = maxBase = word0->base;
2582       colSpace1 = minColSpacing1 * fontSize;
2583       colSpace2 = minColSpacing2 * fontSize;
2584       lineSpace = maxLineSpacingDelta * fontSize;
2585       intraLineSpace = maxIntraLineDelta * fontSize;
2586 
2587       // add words to the block
2588       do {
2589 	found = gFalse;
2590 
2591 	// look for words on the line above the current top edge of
2592 	// the block
2593 	newMinBase = minBase;
2594 	for (baseIdx = pool->getBaseIdx(minBase);
2595 	     baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2596 	     --baseIdx) {
2597 	  word0 = NULL;
2598 	  word1 = pool->getPool(baseIdx);
2599 	  while (word1) {
2600 	    if (word1->base < minBase &&
2601 		word1->base >= minBase - lineSpace &&
2602 		((rot == 0 || rot == 2)
2603 		 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2604 		 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2605 		fabs(word1->fontSize - fontSize) <
2606 		  maxBlockFontSizeDelta1 * fontSize) {
2607 	      word2 = word1;
2608 	      if (word0) {
2609 		word0->next = word1->next;
2610 	      } else {
2611 		pool->setPool(baseIdx, word1->next);
2612 	      }
2613 	      word1 = word1->next;
2614 	      word2->next = NULL;
2615 	      blk->addWord(word2);
2616 	      found = gTrue;
2617 	      newMinBase = word2->base;
2618 	    } else {
2619 	      word0 = word1;
2620 	      word1 = word1->next;
2621 	    }
2622 	  }
2623 	}
2624 	minBase = newMinBase;
2625 
2626 	// look for words on the line below the current bottom edge of
2627 	// the block
2628 	newMaxBase = maxBase;
2629 	for (baseIdx = pool->getBaseIdx(maxBase);
2630 	     baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2631 	     ++baseIdx) {
2632 	  word0 = NULL;
2633 	  word1 = pool->getPool(baseIdx);
2634 	  while (word1) {
2635 	    if (word1->base > maxBase &&
2636 		word1->base <= maxBase + lineSpace &&
2637 		((rot == 0 || rot == 2)
2638 		 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2639 		 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2640 		fabs(word1->fontSize - fontSize) <
2641 		  maxBlockFontSizeDelta1 * fontSize) {
2642 	      word2 = word1;
2643 	      if (word0) {
2644 		word0->next = word1->next;
2645 	      } else {
2646 		pool->setPool(baseIdx, word1->next);
2647 	      }
2648 	      word1 = word1->next;
2649 	      word2->next = NULL;
2650 	      blk->addWord(word2);
2651 	      found = gTrue;
2652 	      newMaxBase = word2->base;
2653 	    } else {
2654 	      word0 = word1;
2655 	      word1 = word1->next;
2656 	    }
2657 	  }
2658 	}
2659 	maxBase = newMaxBase;
2660 
2661 	// look for words that are on lines already in the block, and
2662 	// that overlap the block horizontally
2663 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2664 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2665 	     ++baseIdx) {
2666 	  word0 = NULL;
2667 	  word1 = pool->getPool(baseIdx);
2668 	  while (word1) {
2669 	    if (word1->base >= minBase - intraLineSpace &&
2670 		word1->base <= maxBase + intraLineSpace &&
2671 		((rot == 0 || rot == 2)
2672 		 ? (word1->xMin < blk->xMax + colSpace1 &&
2673 		    word1->xMax > blk->xMin - colSpace1)
2674 		 : (word1->yMin < blk->yMax + colSpace1 &&
2675 		    word1->yMax > blk->yMin - colSpace1)) &&
2676 		fabs(word1->fontSize - fontSize) <
2677 		  maxBlockFontSizeDelta2 * fontSize) {
2678 	      word2 = word1;
2679 	      if (word0) {
2680 		word0->next = word1->next;
2681 	      } else {
2682 		pool->setPool(baseIdx, word1->next);
2683 	      }
2684 	      word1 = word1->next;
2685 	      word2->next = NULL;
2686 	      blk->addWord(word2);
2687 	      found = gTrue;
2688 	    } else {
2689 	      word0 = word1;
2690 	      word1 = word1->next;
2691 	    }
2692 	  }
2693 	}
2694 
2695 	// only check for outlying words (the next two chunks of code)
2696 	// if we didn't find anything else
2697 	if (found) {
2698 	  continue;
2699 	}
2700 
2701 	// scan down the left side of the block, looking for words
2702 	// that are near (but not overlapping) the block; if there are
2703 	// three or fewer, add them to the block
2704 	n = 0;
2705 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2706 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2707 	     ++baseIdx) {
2708 	  word1 = pool->getPool(baseIdx);
2709 	  while (word1) {
2710 	    if (word1->base >= minBase - intraLineSpace &&
2711 		word1->base <= maxBase + intraLineSpace &&
2712 		((rot == 0 || rot == 2)
2713 		 ? (word1->xMax <= blk->xMin &&
2714 		    word1->xMax > blk->xMin - colSpace2)
2715 		 : (word1->yMax <= blk->yMin &&
2716 		    word1->yMax > blk->yMin - colSpace2)) &&
2717 		fabs(word1->fontSize - fontSize) <
2718 		  maxBlockFontSizeDelta3 * fontSize) {
2719 	      ++n;
2720 	      break;
2721 	    }
2722 	    word1 = word1->next;
2723 	  }
2724 	}
2725 	if (n > 0 && n <= 3) {
2726 	  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2727 	       baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2728 	       ++baseIdx) {
2729 	    word0 = NULL;
2730 	    word1 = pool->getPool(baseIdx);
2731 	    while (word1) {
2732 	      if (word1->base >= minBase - intraLineSpace &&
2733 		  word1->base <= maxBase + intraLineSpace &&
2734 		  ((rot == 0 || rot == 2)
2735 		   ? (word1->xMax <= blk->xMin &&
2736 		      word1->xMax > blk->xMin - colSpace2)
2737 		   : (word1->yMax <= blk->yMin &&
2738 		      word1->yMax > blk->yMin - colSpace2)) &&
2739 		  fabs(word1->fontSize - fontSize) <
2740 		    maxBlockFontSizeDelta3 * fontSize) {
2741 		word2 = word1;
2742 		if (word0) {
2743 		  word0->next = word1->next;
2744 		} else {
2745 		  pool->setPool(baseIdx, word1->next);
2746 		}
2747 		word1 = word1->next;
2748 		word2->next = NULL;
2749 		blk->addWord(word2);
2750 		if (word2->base < minBase) {
2751 		  minBase = word2->base;
2752 		} else if (word2->base > maxBase) {
2753 		  maxBase = word2->base;
2754 		}
2755 		found = gTrue;
2756 		break;
2757 	      } else {
2758 		word0 = word1;
2759 		word1 = word1->next;
2760 	      }
2761 	    }
2762 	  }
2763 	}
2764 
2765 	// scan down the right side of the block, looking for words
2766 	// that are near (but not overlapping) the block; if there are
2767 	// three or fewer, add them to the block
2768 	n = 0;
2769 	for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2770 	     baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2771 	     ++baseIdx) {
2772 	  word1 = pool->getPool(baseIdx);
2773 	  while (word1) {
2774 	    if (word1->base >= minBase - intraLineSpace &&
2775 		word1->base <= maxBase + intraLineSpace &&
2776 		((rot == 0 || rot == 2)
2777 		 ? (word1->xMin >= blk->xMax &&
2778 		    word1->xMin < blk->xMax + colSpace2)
2779 		 : (word1->yMin >= blk->yMax &&
2780 		    word1->yMin < blk->yMax + colSpace2)) &&
2781 		fabs(word1->fontSize - fontSize) <
2782 		  maxBlockFontSizeDelta3 * fontSize) {
2783 	      ++n;
2784 	      break;
2785 	    }
2786 	    word1 = word1->next;
2787 	  }
2788 	}
2789 	if (n > 0 && n <= 3) {
2790 	  for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2791 	       baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2792 	       ++baseIdx) {
2793 	    word0 = NULL;
2794 	    word1 = pool->getPool(baseIdx);
2795 	    while (word1) {
2796 	      if (word1->base >= minBase - intraLineSpace &&
2797 		  word1->base <= maxBase + intraLineSpace &&
2798 		  ((rot == 0 || rot == 2)
2799 		   ? (word1->xMin >= blk->xMax &&
2800 		      word1->xMin < blk->xMax + colSpace2)
2801 		   : (word1->yMin >= blk->yMax &&
2802 		      word1->yMin < blk->yMax + colSpace2)) &&
2803 		  fabs(word1->fontSize - fontSize) <
2804 		    maxBlockFontSizeDelta3 * fontSize) {
2805 		word2 = word1;
2806 		if (word0) {
2807 		  word0->next = word1->next;
2808 		} else {
2809 		  pool->setPool(baseIdx, word1->next);
2810 		}
2811 		word1 = word1->next;
2812 		word2->next = NULL;
2813 		blk->addWord(word2);
2814 		if (word2->base < minBase) {
2815 		  minBase = word2->base;
2816 		} else if (word2->base > maxBase) {
2817 		  maxBase = word2->base;
2818 		}
2819 		found = gTrue;
2820 		break;
2821 	      } else {
2822 		word0 = word1;
2823 		word1 = word1->next;
2824 	      }
2825 	    }
2826 	  }
2827 	}
2828 
2829       } while (found);
2830 
2831       //~ need to compute the primary writing mode (horiz/vert) in
2832       //~ addition to primary rotation
2833 
2834       // coalesce the block, and add it to the list
2835       blk->coalesce(uMap);
2836       if (lastBlk) {
2837 	lastBlk->next = blk;
2838       } else {
2839 	blkList = blk;
2840       }
2841       lastBlk = blk;
2842       count[rot] += blk->charCount;
2843       if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2844 	primaryRot = rot;
2845       }
2846       ++nBlocks;
2847     }
2848   }
2849 
2850 #if 0 // for debugging
2851   printf("*** rotation ***\n");
2852   for (rot = 0; rot < 4; ++rot) {
2853     printf("  %d: %6d\n", rot, count[rot]);
2854   }
2855   printf("  primary rot = %d\n", primaryRot);
2856   printf("\n");
2857 #endif
2858 
2859 #if 0 // for debugging
2860   printf("*** blocks ***\n");
2861   for (blk = blkList; blk; blk = blk->next) {
2862     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2863 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2864     for (line = blk->lines; line; line = line->next) {
2865       printf("  line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2866 	     line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2867       for (word0 = line->words; word0; word0 = word0->next) {
2868 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2869 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2870 	       word0->base, word0->fontSize, word0->spaceAfter);
2871 	for (i = 0; i < word0->len; ++i) {
2872 	  fputc(word0->text[i] & 0xff, stdout);
2873 	}
2874 	printf("'\n");
2875       }
2876     }
2877   }
2878   printf("\n");
2879 #endif
2880 
2881   // determine the primary direction
2882   lrCount = 0;
2883   for (blk = blkList; blk; blk = blk->next) {
2884     for (line = blk->lines; line; line = line->next) {
2885       for (word0 = line->words; word0; word0 = word0->next) {
2886 	for (i = 0; i < word0->len; ++i) {
2887 	  if (unicodeTypeL(word0->text[i])) {
2888 	    ++lrCount;
2889 	  } else if (unicodeTypeR(word0->text[i])) {
2890 	    --lrCount;
2891 	  }
2892 	}
2893       }
2894     }
2895   }
2896   primaryLR = lrCount >= 0;
2897 
2898 #if 0 // for debugging
2899   printf("*** direction ***\n");
2900   printf("lrCount = %d\n", lrCount);
2901   printf("primaryLR = %d\n", primaryLR);
2902 #endif
2903 
2904   //----- column assignment
2905 
2906   // sort blocks into xy order for column assignment
2907   if (blocks)
2908     gfree (blocks);
2909   blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2910   for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2911     blocks[i] = blk;
2912   }
2913   qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2914 
2915   // column assignment
2916   for (i = 0; i < nBlocks; ++i) {
2917     blk0 = blocks[i];
2918     col1 = 0;
2919     for (j = 0; j < i; ++j) {
2920       blk1 = blocks[j];
2921       col2 = 0; // make gcc happy
2922       switch (primaryRot) {
2923       case 0:
2924 	if (blk0->xMin > blk1->xMax) {
2925 	  col2 = blk1->col + blk1->nColumns + 3;
2926 	} else if (blk1->xMax == blk1->xMin) {
2927 	  col2 = blk1->col;
2928 	} else {
2929 	  col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2930 				    (blk1->xMax - blk1->xMin)) *
2931 				   blk1->nColumns);
2932 	}
2933 	break;
2934       case 1:
2935 	if (blk0->yMin > blk1->yMax) {
2936 	  col2 = blk1->col + blk1->nColumns + 3;
2937 	} else if (blk1->yMax == blk1->yMin) {
2938 	  col2 = blk1->col;
2939 	} else {
2940 	  col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2941 				    (blk1->yMax - blk1->yMin)) *
2942 				   blk1->nColumns);
2943 	}
2944 	break;
2945       case 2:
2946 	if (blk0->xMax < blk1->xMin) {
2947 	  col2 = blk1->col + blk1->nColumns + 3;
2948 	} else if (blk1->xMin == blk1->xMax) {
2949 	  col2 = blk1->col;
2950 	} else {
2951 	  col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2952 				    (blk1->xMin - blk1->xMax)) *
2953 				   blk1->nColumns);
2954 	}
2955 	break;
2956       case 3:
2957 	if (blk0->yMax < blk1->yMin) {
2958 	  col2 = blk1->col + blk1->nColumns + 3;
2959 	} else if (blk1->yMin == blk1->yMax) {
2960 	  col2 = blk1->col;
2961 	} else {
2962 	  col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2963 				    (blk1->yMin - blk1->yMax)) *
2964 				   blk1->nColumns);
2965 	}
2966 	break;
2967       }
2968       if (col2 > col1) {
2969 	col1 = col2;
2970       }
2971     }
2972     blk0->col = col1;
2973     for (line = blk0->lines; line; line = line->next) {
2974       for (j = 0; j <= line->len; ++j) {
2975 	line->col[j] += col1;
2976       }
2977     }
2978   }
2979 
2980 #if 0 // for debugging
2981   printf("*** blocks, after column assignment ***\n");
2982   for (blk = blkList; blk; blk = blk->next) {
2983     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2984 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2985 	   blk->nColumns);
2986     for (line = blk->lines; line; line = line->next) {
2987       printf("  line:\n");
2988       for (word0 = line->words; word0; word0 = word0->next) {
2989 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2990 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2991 	       word0->base, word0->fontSize, word0->spaceAfter);
2992 	for (i = 0; i < word0->len; ++i) {
2993 	  fputc(word0->text[i] & 0xff, stdout);
2994 	}
2995 	printf("'\n");
2996       }
2997     }
2998   }
2999   printf("\n");
3000 #endif
3001 
3002   //----- reading order sort
3003 
3004   // compute space on left and right sides of each block
3005   for (i = 0; i < nBlocks; ++i) {
3006     blk0 = blocks[i];
3007     for (j = 0; j < nBlocks; ++j) {
3008       blk1 = blocks[j];
3009       if (blk1 != blk0) {
3010 	blk0->updatePriMinMax(blk1);
3011       }
3012     }
3013   }
3014 
3015 #if 0 // for debugging
3016   printf("PAGE\n");
3017 #endif
3018 
3019   int sortPos = 0;
3020   GBool *visited = (GBool *)gmallocn(nBlocks, sizeof(GBool));
3021   for (i = 0; i < nBlocks; i++) {
3022     visited[i] = gFalse;
3023   }
3024 
3025   double bxMin0, byMin0, bxMin1, byMin1;
3026   int numTables = 0;
3027   int tableId = -1;
3028   int correspondenceX, correspondenceY;
3029   double xCentre1, yCentre1, xCentre2, yCentre2;
3030   double xCentre3, yCentre3, xCentre4, yCentre4;
3031   double deltaX, deltaY;
3032   TextBlock *fblk2 = NULL, *fblk3 = NULL, *fblk4 = NULL;
3033 
3034   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3035     blk1->ExMin = blk1->xMin;
3036     blk1->ExMax = blk1->xMax;
3037     blk1->EyMin = blk1->yMin;
3038     blk1->EyMax = blk1->yMax;
3039 
3040     bxMin0 = DBL_MAX;
3041     byMin0 = DBL_MAX;
3042     bxMin1 = DBL_MAX;
3043     byMin1 = DBL_MAX;
3044 
3045     fblk2 = NULL;
3046     fblk3 = NULL;
3047     fblk4 = NULL;
3048 
3049     /*  find fblk2, fblk3 and fblk4 so that
3050      *  fblk2 is on the right of blk1 and overlap with blk1 in y axis
3051      *  fblk3 is under blk1 and overlap with blk1 in x axis
3052      *  fblk4 is under blk1 and on the right of blk1
3053      *  and they are closest to blk1
3054      */
3055     for (blk2 = blkList; blk2; blk2 = blk2->next) {
3056       if (blk2 != blk1) {
3057         if (blk2->yMin <= blk1->yMax &&
3058             blk2->yMax >= blk1->yMin &&
3059             blk2->xMin > blk1->xMax &&
3060             blk2->xMin < bxMin0) {
3061           bxMin0 = blk2->xMin;
3062           fblk2 = blk2;
3063         } else if (blk2->xMin <= blk1->xMax &&
3064                    blk2->xMax >= blk1->xMin &&
3065                    blk2->yMin > blk1->yMax &&
3066                    blk2->yMin < byMin0) {
3067           byMin0 = blk2->yMin;
3068           fblk3 = blk2;
3069         } else if (blk2->xMin > blk1->xMax &&
3070                    blk2->xMin < bxMin1 &&
3071                    blk2->yMin > blk1->yMax &&
3072                    blk2->yMin < byMin1) {
3073           bxMin1 = blk2->xMin;
3074           byMin1 = blk2->yMin;
3075           fblk4 = blk2;
3076         }
3077       }
3078     }
3079 
3080     /*  fblk4 can not overlap with fblk3 in x and with fblk2 in y
3081      *  fblk2 can not overlap with fblk3 in x and y
3082      *  fblk4 has to overlap with fblk3 in y and with fblk2 in x
3083      */
3084     if (fblk2 != NULL &&
3085         fblk3 != NULL &&
3086         fblk4 != NULL) {
3087       if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) ||
3088            (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) ||
3089            (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin) ||
3090            (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin)) ||
3091           !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin &&
3092             fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3093         fblk2 = NULL;
3094         fblk3 = NULL;
3095         fblk4 = NULL;
3096       }
3097     }
3098 
3099     // if we found any then look whether they form a table
3100     if (fblk2 != NULL &&
3101         fblk3 != NULL &&
3102         fblk4 != NULL) {
3103       tableId = -1;
3104       correspondenceX = 0;
3105       correspondenceY = 0;
3106       deltaX = 0.0;
3107       deltaY = 0.0;
3108 
3109       if (blk1->lines && blk1->lines->words)
3110         deltaX = blk1->lines->words->getFontSize();
3111       if (fblk2->lines && fblk2->lines->words)
3112         deltaX = deltaX < fblk2->lines->words->getFontSize() ?
3113                    deltaX : fblk2->lines->words->getFontSize();
3114       if (fblk3->lines && fblk3->lines->words)
3115         deltaX = deltaX < fblk3->lines->words->getFontSize() ?
3116                    deltaX : fblk3->lines->words->getFontSize();
3117       if (fblk4->lines && fblk4->lines->words)
3118         deltaX = deltaX < fblk4->lines->words->getFontSize() ?
3119                    deltaX : fblk4->lines->words->getFontSize();
3120 
3121       deltaY = deltaX;
3122 
3123       deltaX *= minColSpacing1;
3124       deltaY *= maxIntraLineDelta;
3125 
3126       xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3127       yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3128       xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3129       yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3130       xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3131       yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3132       xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3133       yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3134 
3135       // are blocks centrally aligned in x ?
3136       if (fabs (xCentre1 - xCentre3) <= deltaX &&
3137           fabs (xCentre2 - xCentre4) <= deltaX)
3138         correspondenceX++;
3139 
3140       // are blocks centrally aligned in y ?
3141       if (fabs (yCentre1 - yCentre2) <= deltaY &&
3142           fabs (yCentre3 - yCentre4) <= deltaY)
3143         correspondenceY++;
3144 
3145       // are blocks aligned to the left ?
3146       if (fabs (blk1->xMin - fblk3->xMin) <= deltaX &&
3147           fabs (fblk2->xMin - fblk4->xMin) <= deltaX)
3148         correspondenceX++;
3149 
3150       // are blocks aligned to the right ?
3151       if (fabs (blk1->xMax - fblk3->xMax) <= deltaX &&
3152           fabs (fblk2->xMax - fblk4->xMax) <= deltaX)
3153         correspondenceX++;
3154 
3155       // are blocks aligned to the top ?
3156       if (fabs (blk1->yMin - fblk2->yMin) <= deltaY &&
3157           fabs (fblk3->yMin - fblk4->yMin) <= deltaY)
3158         correspondenceY++;
3159 
3160       // are blocks aligned to the bottom ?
3161       if (fabs (blk1->yMax - fblk2->yMax) <= deltaY &&
3162           fabs (fblk3->yMax - fblk4->yMax) <= deltaY)
3163         correspondenceY++;
3164 
3165       // are blocks aligned in x and y ?
3166       if (correspondenceX > 0 &&
3167           correspondenceY > 0) {
3168 
3169         // find maximal tableId
3170         tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3171         tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3172         tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3173         tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3174 
3175         // if the tableId is -1, then we found new table
3176         if (tableId < 0) {
3177           tableId = numTables;
3178           numTables++;
3179         }
3180 
3181         blk1->tableId = tableId;
3182         fblk2->tableId = tableId;
3183         fblk3->tableId = tableId;
3184         fblk4->tableId = tableId;
3185       }
3186     }
3187   }
3188 
3189   /*  set extended bounding boxes of all table entries
3190    *  so that they contain whole table
3191    *  (we need to process whole table size when comparing it
3192    *   with regular text blocks)
3193    */
3194   PDFRectangle *envelopes = new PDFRectangle [numTables];
3195   TextBlock **ending_blocks = new TextBlock* [numTables];
3196 
3197   for (i = 0; i < numTables; i++) {
3198     envelopes[i].x1 = DBL_MAX;
3199     envelopes[i].x2 = DBL_MIN;
3200     envelopes[i].y1 = DBL_MAX;
3201     envelopes[i].y2 = DBL_MIN;
3202   }
3203 
3204   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3205     if (blk1->tableId >= 0) {
3206       if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3207         envelopes[blk1->tableId].x1 = blk1->ExMin;
3208         if (!blk1->page->primaryLR)
3209           ending_blocks[blk1->tableId] = blk1;
3210       }
3211 
3212       if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3213         envelopes[blk1->tableId].x2 = blk1->ExMax;
3214         if (blk1->page->primaryLR)
3215           ending_blocks[blk1->tableId] = blk1;
3216       }
3217 
3218       envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ?
3219                                       blk1->EyMin : envelopes[blk1->tableId].y1;
3220       envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ?
3221                                       blk1->EyMax : envelopes[blk1->tableId].y2;
3222     }
3223   }
3224 
3225   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3226     if (blk1->tableId >= 0 &&
3227         blk1->xMin <= ending_blocks[blk1->tableId]->xMax &&
3228         blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3229       blk1->tableEnd = gTrue;
3230     }
3231   }
3232 
3233   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3234     if (blk1->tableId >= 0) {
3235       blk1->ExMin = envelopes[blk1->tableId].x1;
3236       blk1->ExMax = envelopes[blk1->tableId].x2;
3237       blk1->EyMin = envelopes[blk1->tableId].y1;
3238       blk1->EyMax = envelopes[blk1->tableId].y2;
3239     }
3240   }
3241   delete[] envelopes;
3242   delete[] ending_blocks;
3243 
3244 
3245   /*  set extended bounding boxes of all other blocks
3246    *  so that they extend in x without hitting neighbours
3247    */
3248   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3249     if (!blk1->tableId >= 0) {
3250       double xMax = DBL_MAX;
3251       double xMin = DBL_MIN;
3252 
3253       for (blk2 = blkList; blk2; blk2 = blk2->next) {
3254         if (blk2 == blk1)
3255            continue;
3256 
3257         if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3258           if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3259             xMax = blk2->xMin;
3260 
3261           if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3262             xMin = blk2->xMax;
3263         }
3264       }
3265 
3266       for (blk2 = blkList; blk2; blk2 = blk2->next) {
3267         if (blk2 == blk1)
3268            continue;
3269 
3270         if (blk2->xMax > blk1->ExMax &&
3271             blk2->xMax <= xMax &&
3272             blk2->yMin >= blk1->yMax) {
3273           blk1->ExMax = blk2->xMax;
3274         }
3275 
3276         if (blk2->xMin < blk1->ExMin &&
3277             blk2->xMin >= xMin &&
3278             blk2->yMin >= blk1->yMax)
3279           blk1->ExMin = blk2->xMin;
3280       }
3281     }
3282   }
3283 
3284   i = -1;
3285   for (blk1 = blkList; blk1; blk1 = blk1->next) {
3286     i++;
3287     sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3288   }
3289   if (visited) {
3290     gfree(visited);
3291   }
3292 
3293 #if 0 // for debugging
3294   printf("*** blocks, after ro sort ***\n");
3295   for (i = 0; i < nBlocks; ++i) {
3296     blk = blocks[i];
3297     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3298 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3299 	   blk->priMin, blk->priMax);
3300     for (line = blk->lines; line; line = line->next) {
3301       printf("  line:\n");
3302       for (word0 = line->words; word0; word0 = word0->next) {
3303 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3304 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3305 	       word0->base, word0->fontSize, word0->spaceAfter);
3306 	for (j = 0; j < word0->len; ++j) {
3307 	  fputc(word0->text[j] & 0xff, stdout);
3308 	}
3309 	printf("'\n");
3310       }
3311     }
3312   }
3313   printf("\n");
3314   fflush(stdout);
3315 #endif
3316 
3317   // build the flows
3318   //~ this needs to be adjusted for writing mode (vertical text)
3319   //~ this also needs to account for right-to-left column ordering
3320   flow = NULL;
3321   while (flows) {
3322     flow = flows;
3323     flows = flows->next;
3324     delete flow;
3325   }
3326   flows = lastFlow = NULL;
3327   // assume blocks are already in reading order,
3328   // and construct flows accordingly.
3329   for (i = 0; i < nBlocks; i++) {
3330     blk = blocks[i];
3331     blk->next = NULL;
3332     if (flow) {
3333       blk1 = blocks[i - 1];
3334       blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3335       if (blk1->secondaryDelta(blk) <= blkSpace &&
3336 	  blk->isBelow(blk1) &&
3337 	  flow->blockFits(blk, blk1)) {
3338 	flow->addBlock(blk);
3339 	continue;
3340       }
3341     }
3342     flow = new TextFlow(this, blk);
3343     if (lastFlow) {
3344       lastFlow->next = flow;
3345     } else {
3346       flows = flow;
3347     }
3348     lastFlow = flow;
3349   }
3350 
3351 #if 0 // for debugging
3352   printf("*** flows ***\n");
3353   for (flow = flows; flow; flow = flow->next) {
3354     printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3355 	   flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3356 	   flow->priMin, flow->priMax);
3357     for (blk = flow->blocks; blk; blk = blk->next) {
3358       printf("  block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3359 	     blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3360 	     blk->priMin, blk->priMax);
3361       for (line = blk->lines; line; line = line->next) {
3362 	printf("    line:\n");
3363 	for (word0 = line->words; word0; word0 = word0->next) {
3364 	  printf("      word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3365 		 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3366 		 word0->base, word0->fontSize, word0->spaceAfter);
3367 	  for (i = 0; i < word0->len; ++i) {
3368 	    fputc(word0->text[i] & 0xff, stdout);
3369 	  }
3370 	  printf("'\n");
3371 	}
3372       }
3373     }
3374   }
3375   printf("\n");
3376 #endif
3377 
3378   if (uMap) {
3379     uMap->decRefCnt();
3380   }
3381 }
3382 
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)3383 GBool TextPage::findText(Unicode *s, int len,
3384 			 GBool startAtTop, GBool stopAtBottom,
3385 			 GBool startAtLast, GBool stopAtLast,
3386 			 GBool caseSensitive, GBool backward,
3387 			 double *xMin, double *yMin,
3388 			 double *xMax, double *yMax) {
3389   TextBlock *blk;
3390   TextLine *line;
3391   Unicode *s2, *txt;
3392   Unicode *p;
3393   int txtSize, m, i, j, k;
3394   double xStart, yStart, xStop, yStop;
3395   double xMin0, yMin0, xMax0, yMax0;
3396   double xMin1, yMin1, xMax1, yMax1;
3397   GBool found;
3398 
3399   //~ needs to handle right-to-left text
3400 
3401   if (rawOrder) {
3402     return gFalse;
3403   }
3404 
3405   // convert the search string to uppercase
3406   if (!caseSensitive) {
3407     s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
3408     for (i = 0; i < len; ++i) {
3409       s2[i] = unicodeToUpper(s2[i]);
3410     }
3411   } else {
3412     s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
3413   }
3414 
3415   txt = NULL;
3416   txtSize = 0;
3417 
3418   xStart = yStart = xStop = yStop = 0;
3419   if (startAtLast && haveLastFind) {
3420     xStart = lastFindXMin;
3421     yStart = lastFindYMin;
3422   } else if (!startAtTop) {
3423     xStart = *xMin;
3424     yStart = *yMin;
3425   }
3426   if (stopAtLast && haveLastFind) {
3427     xStop = lastFindXMin;
3428     yStop = lastFindYMin;
3429   } else if (!stopAtBottom) {
3430     xStop = *xMax;
3431     yStop = *yMax;
3432   }
3433 
3434   found = gFalse;
3435   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3436   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3437 
3438   for (i = backward ? nBlocks - 1 : 0;
3439        backward ? i >= 0 : i < nBlocks;
3440        i += backward ? -1 : 1) {
3441     blk = blocks[i];
3442 
3443     // check: is the block above the top limit?
3444     if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3445       continue;
3446     }
3447 
3448     // check: is the block below the bottom limit?
3449     if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3450       break;
3451     }
3452 
3453     for (line = blk->lines; line; line = line->next) {
3454 
3455       // check: is the line above the top limit?
3456       if (!startAtTop &&
3457 	  (backward ? line->yMin > yStart : line->yMin < yStart)) {
3458 	continue;
3459       }
3460 
3461       // check: is the line below the bottom limit?
3462       if (!stopAtBottom &&
3463 	  (backward ? line->yMin < yStop : line->yMin > yStop)) {
3464 	continue;
3465       }
3466 
3467       if (!line->normalized)
3468 	line->normalized = unicodeNormalizeNFKC(line->text, line->len,
3469 						&line->normalized_len,
3470 						&line->normalized_idx);
3471       // convert the line to uppercase
3472       m = line->normalized_len;
3473       if (!caseSensitive) {
3474 	if (m > txtSize) {
3475 	  txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3476 	  txtSize = m;
3477 	}
3478 	for (k = 0; k < m; ++k) {
3479 	  txt[k] = unicodeToUpper(line->normalized[k]);
3480 	  }
3481 	  } else {
3482 	txt = line->normalized;
3483 	  }
3484 
3485       // search each position in this line
3486       j = backward ? m - len : 0;
3487       p = txt + j;
3488       while (backward ? j >= 0 : j <= m - len) {
3489 
3490 	// compare the strings
3491 	for (k = 0; k < len; ++k) {
3492 	  if (p[k] != s2[k]) {
3493 	    break;
3494 	  }
3495 	}
3496 
3497 	// found it
3498 	if (k == len) {
3499 	  // where s2 matches a subsequence of a compatibility equivalence
3500 	  // decomposition, highlight the entire glyph, since we don't know
3501 	  // the internal layout of subglyph components
3502 	  int normStart = line->normalized_idx[j];
3503 	  int normAfterEnd = line->normalized_idx[j + len - 1] + 1;
3504 	  switch (line->rot) {
3505 	  case 0:
3506 	    xMin1 = line->edge[normStart];
3507 	    xMax1 = line->edge[normAfterEnd];
3508 	    yMin1 = line->yMin;
3509 	    yMax1 = line->yMax;
3510 	    break;
3511 	  case 1:
3512 	    xMin1 = line->xMin;
3513 	    xMax1 = line->xMax;
3514 	    yMin1 = line->edge[normStart];
3515 	    yMax1 = line->edge[normAfterEnd];
3516 	    break;
3517 	  case 2:
3518 	    xMin1 = line->edge[normAfterEnd];
3519 	    xMax1 = line->edge[normStart];
3520 	    yMin1 = line->yMin;
3521 	    yMax1 = line->yMax;
3522 	    break;
3523 	  case 3:
3524 	    xMin1 = line->xMin;
3525 	    xMax1 = line->xMax;
3526 	    yMin1 = line->edge[normAfterEnd];
3527 	    yMax1 = line->edge[normStart];
3528 	    break;
3529 	  }
3530 	  if (backward) {
3531 	    if ((startAtTop ||
3532 		 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3533 		(stopAtBottom ||
3534 		 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3535 	      if (!found ||
3536 		  yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3537 		xMin0 = xMin1;
3538 		xMax0 = xMax1;
3539 		yMin0 = yMin1;
3540 		yMax0 = yMax1;
3541 		found = gTrue;
3542 	      }
3543 	    }
3544 	  } else {
3545 	    if ((startAtTop ||
3546 		 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3547 		(stopAtBottom ||
3548 		 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3549 	      if (!found ||
3550 		  yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3551 		xMin0 = xMin1;
3552 		xMax0 = xMax1;
3553 		yMin0 = yMin1;
3554 		yMax0 = yMax1;
3555 		found = gTrue;
3556 	      }
3557 	    }
3558 	  }
3559 	}
3560 	if (backward) {
3561 	  --j;
3562 	  --p;
3563 	} else {
3564 	  ++j;
3565 	  ++p;
3566 	}
3567       }
3568     }
3569     }
3570 
3571   gfree(s2);
3572   if (!caseSensitive) {
3573     gfree(txt);
3574   }
3575 
3576   if (found) {
3577     *xMin = xMin0;
3578     *xMax = xMax0;
3579     *yMin = yMin0;
3580     *yMax = yMax0;
3581     lastFindXMin = xMin0;
3582     lastFindYMin = yMin0;
3583     haveLastFind = gTrue;
3584     return gTrue;
3585   }
3586 
3587   return gFalse;
3588 }
3589 
getText(double xMin,double yMin,double xMax,double yMax)3590 GooString *TextPage::getText(double xMin, double yMin,
3591 			   double xMax, double yMax) {
3592   GooString *s;
3593   UnicodeMap *uMap;
3594   GBool isUnicode;
3595   TextBlock *blk;
3596   TextLine *line;
3597   TextLineFrag *frags;
3598   int nFrags, fragsSize;
3599   TextLineFrag *frag;
3600   char space[8], eol[16];
3601   int spaceLen, eolLen;
3602   int lastRot;
3603   double x, y, delta;
3604   int col, idx0, idx1, i, j;
3605   GBool multiLine, oneRot;
3606 
3607   s = new GooString();
3608 
3609   // get the output encoding
3610   if (!(uMap = globalParams->getTextEncoding())) {
3611     return s;
3612   }
3613 
3614   if (rawOrder) {
3615     TextWord*  word;
3616     char mbc[16];
3617     int  mbc_len;
3618 
3619     for (word = rawWords; word && word <= rawLastWord; word = word->next) {
3620       for (j = 0; j < word->getLength(); ++j) {
3621         double gXMin, gXMax, gYMin, gYMax;
3622         word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
3623         if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax)
3624         {
3625           mbc_len = uMap->mapUnicode( *(word->getChar(j)), mbc, sizeof(mbc) );
3626           s->append(mbc, mbc_len);
3627         }
3628       }
3629     }
3630     return s;
3631   }
3632 
3633   isUnicode = uMap->isUnicode();
3634   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3635   eolLen = 0; // make gcc happy
3636   switch (globalParams->getTextEOL()) {
3637   case eolUnix:
3638     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3639     break;
3640   case eolDOS:
3641     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3642     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3643     break;
3644   case eolMac:
3645     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3646     break;
3647   }
3648 
3649   //~ writing mode (horiz/vert)
3650 
3651   // collect the line fragments that are in the rectangle
3652   fragsSize = 256;
3653   frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3654   nFrags = 0;
3655   lastRot = -1;
3656   oneRot = gTrue;
3657   for (i = 0; i < nBlocks; ++i) {
3658     blk = blocks[i];
3659     if (xMin < blk->xMax && blk->xMin < xMax &&
3660 	yMin < blk->yMax && blk->yMin < yMax) {
3661       for (line = blk->lines; line; line = line->next) {
3662 	if (xMin < line->xMax && line->xMin < xMax &&
3663 	    yMin < line->yMax && line->yMin < yMax) {
3664 	  idx0 = idx1 = -1;
3665 	  switch (line->rot) {
3666 	  case 0:
3667 	    y = 0.5 * (line->yMin + line->yMax);
3668 	    if (yMin < y && y < yMax) {
3669 	      j = 0;
3670 	      while (j < line->len) {
3671 		if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3672 		  idx0 = j;
3673 		  break;
3674 		}
3675 		++j;
3676 	      }
3677 	      j = line->len - 1;
3678 	      while (j >= 0) {
3679 		if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3680 		  idx1 = j;
3681 		  break;
3682 		}
3683 		--j;
3684 	      }
3685 	    }
3686 	    break;
3687 	  case 1:
3688 	    x = 0.5 * (line->xMin + line->xMax);
3689 	    if (xMin < x && x < xMax) {
3690 	      j = 0;
3691 	      while (j < line->len) {
3692 		if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3693 		  idx0 = j;
3694 		  break;
3695 		}
3696 		++j;
3697 	      }
3698 	      j = line->len - 1;
3699 	      while (j >= 0) {
3700 		if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3701 		  idx1 = j;
3702 		  break;
3703 		}
3704 		--j;
3705 	      }
3706 	    }
3707 	    break;
3708 	  case 2:
3709 	    y = 0.5 * (line->yMin + line->yMax);
3710 	    if (yMin < y && y < yMax) {
3711 	      j = 0;
3712 	      while (j < line->len) {
3713 		if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3714 		  idx0 = j;
3715 		  break;
3716 		}
3717 		++j;
3718 	      }
3719 	      j = line->len - 1;
3720 	      while (j >= 0) {
3721 		if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3722 		  idx1 = j;
3723 		  break;
3724 		}
3725 		--j;
3726 	      }
3727 	    }
3728 	    break;
3729 	  case 3:
3730 	    x = 0.5 * (line->xMin + line->xMax);
3731 	    if (xMin < x && x < xMax) {
3732 	      j = 0;
3733 	      while (j < line->len) {
3734 		if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3735 		  idx0 = j;
3736 		  break;
3737 		}
3738 		++j;
3739 	      }
3740 	      j = line->len - 1;
3741 	      while (j >= 0) {
3742 		if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3743 		  idx1 = j;
3744 		  break;
3745 		}
3746 		--j;
3747 	      }
3748 	    }
3749 	    break;
3750 	  }
3751 	  if (idx0 >= 0 && idx1 >= 0) {
3752 	    if (nFrags == fragsSize) {
3753 	      fragsSize *= 2;
3754 	      frags = (TextLineFrag *)
3755 		          greallocn(frags, fragsSize, sizeof(TextLineFrag));
3756 	    }
3757 	    frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
3758 	    ++nFrags;
3759 	    if (lastRot >= 0 && line->rot != lastRot) {
3760 	      oneRot = gFalse;
3761 	    }
3762 	    lastRot = line->rot;
3763 	  }
3764 	}
3765       }
3766     }
3767   }
3768 
3769   // sort the fragments and generate the string
3770   if (nFrags > 0) {
3771 
3772     for (i = 0; i < nFrags; ++i) {
3773       frags[i].computeCoords(oneRot);
3774     }
3775     assignColumns(frags, nFrags, oneRot);
3776 
3777     // if all lines in the region have the same rotation, use it;
3778     // otherwise, use the page's primary rotation
3779     if (oneRot) {
3780       qsort(frags, nFrags, sizeof(TextLineFrag),
3781 	    &TextLineFrag::cmpYXLineRot);
3782     } else {
3783       qsort(frags, nFrags, sizeof(TextLineFrag),
3784 	    &TextLineFrag::cmpYXPrimaryRot);
3785     }
3786     i = 0;
3787     while (i < nFrags) {
3788       delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3789       for (j = i+1;
3790 	   j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3791 	   ++j) ;
3792       qsort(frags + i, j - i, sizeof(TextLineFrag),
3793 	    oneRot ? &TextLineFrag::cmpXYColumnLineRot
3794 	           : &TextLineFrag::cmpXYColumnPrimaryRot);
3795       i = j;
3796     }
3797 
3798     col = 0;
3799     multiLine = gFalse;
3800     for (i = 0; i < nFrags; ++i) {
3801       frag = &frags[i];
3802 
3803       // insert a return
3804       if (frag->col < col ||
3805 	  (i > 0 && fabs(frag->base - frags[i-1].base) >
3806 	              maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3807 	s->append(eol, eolLen);
3808 	col = 0;
3809 	multiLine = gTrue;
3810       }
3811 
3812       // column alignment
3813       for (; col < frag->col; ++col) {
3814 	s->append(space, spaceLen);
3815       }
3816 
3817       // get the fragment text
3818       col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3819     }
3820 
3821     if (multiLine) {
3822       s->append(eol, eolLen);
3823     }
3824   }
3825 
3826   gfree(frags);
3827   uMap->decRefCnt();
3828 
3829   return s;
3830 }
3831 
3832 class TextSelectionVisitor {
3833 public:
3834   TextSelectionVisitor (TextPage *page);
~TextSelectionVisitor()3835   virtual ~TextSelectionVisitor () { }
3836   virtual void visitBlock (TextBlock *block,
3837 			   TextLine *begin,
3838 			   TextLine *end,
3839 			   PDFRectangle *selection) = 0;
3840   virtual void visitLine (TextLine *line,
3841 			  TextWord *begin,
3842 			  TextWord *end,
3843 			  int edge_begin,
3844 			  int edge_end,
3845 			  PDFRectangle *selection) = 0;
3846   virtual void visitWord (TextWord *word, int begin, int end,
3847 			  PDFRectangle *selection) = 0;
3848 
3849 protected:
3850   TextPage *page;
3851 };
3852 
TextSelectionVisitor(TextPage * page)3853 TextSelectionVisitor::TextSelectionVisitor (TextPage *page)
3854   : page(page)
3855 {
3856 }
3857 
3858 
3859 class TextSelectionDumper : public TextSelectionVisitor {
3860 public:
3861   TextSelectionDumper(TextPage *page);
3862   virtual ~TextSelectionDumper();
3863 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)3864   virtual void visitBlock (TextBlock *block,
3865 			   TextLine *begin,
3866 			   TextLine *end,
3867 			   PDFRectangle *selection) { };
3868   virtual void visitLine (TextLine *line,
3869 			  TextWord *begin,
3870 			  TextWord *end,
3871 			  int edge_begin,
3872 			  int edge_end,
3873 			  PDFRectangle *selection);
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)3874   virtual void visitWord (TextWord *word, int begin, int end,
3875 			  PDFRectangle *selection) { };
3876 
3877   GooString *getText(void);
3878 
3879 private:
3880   TextLineFrag *frags;
3881   int nFrags, fragsSize;
3882 };
3883 
TextSelectionDumper(TextPage * page)3884 TextSelectionDumper::TextSelectionDumper(TextPage *page)
3885     : TextSelectionVisitor(page)
3886 {
3887   fragsSize = 256;
3888   frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3889   nFrags = 0;
3890 }
3891 
~TextSelectionDumper()3892 TextSelectionDumper::~TextSelectionDumper()
3893 {
3894   gfree(frags);
3895 }
3896 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)3897 void TextSelectionDumper::visitLine (TextLine *line,
3898 				     TextWord *begin,
3899 				     TextWord *end,
3900 				     int edge_begin,
3901 				     int edge_end,
3902 				     PDFRectangle *selection)
3903 {
3904   if (nFrags == fragsSize) {
3905     fragsSize *= 2;
3906     frags = (TextLineFrag *) grealloc(frags, fragsSize * sizeof(TextLineFrag));
3907   }
3908 
3909   frags[nFrags].init(line, edge_begin, edge_end - edge_begin);
3910   ++nFrags;
3911 
3912 }
3913 
getText(void)3914 GooString *TextSelectionDumper::getText (void)
3915 {
3916   GooString *s;
3917   TextLineFrag *frag;
3918   int i, j;
3919   GBool multiLine;
3920   UnicodeMap *uMap;
3921   char space[8], eol[16];
3922   int spaceLen, eolLen;
3923   GooList *strings = NULL;
3924   int actual_table = -1;
3925   int actual_line = -1;
3926   int last_length = 0;
3927   TextBlock *actual_block = NULL;
3928 
3929   s = new GooString();
3930 
3931   uMap = globalParams->getTextEncoding();
3932 
3933   if (uMap == NULL)
3934       return s;
3935 
3936   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3937   eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3938 
3939   if (nFrags > 0) {
3940     multiLine = gFalse;
3941     for (i = 0; i < nFrags; ++i) {
3942       frag = &frags[i];
3943 
3944       if (actual_table >= 0 && frag->line->blk->tableId < 0) {
3945         for (j = 0; j < strings->getLength (); j++) {
3946           s->append ((GooString*) strings->get (j));
3947           s->append (eol, eolLen);
3948           delete ((GooString*) strings->get (j));
3949         }
3950         delete strings;
3951         strings = NULL;
3952         actual_table = -1;
3953         actual_line = -1;
3954         actual_block = NULL;
3955       }
3956 
3957       // a table
3958       if (frag->line->blk->tableId >= 0) {
3959         if (actual_table == -1) {
3960           strings = new GooList();
3961           actual_table = frag->line->blk->tableId;
3962           actual_block = frag->line->blk;
3963           actual_line = -1;
3964         }
3965 
3966         // the same block
3967         if (actual_block == frag->line->blk) {
3968           actual_line++;
3969           if (actual_line >= strings->getLength ()) {
3970             GooString *t = new GooString ();
3971             // add some spaces to have this block correctly aligned
3972             if (actual_line > 0)
3973               for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++)
3974                 t->append (space, spaceLen);
3975             strings->append (t);
3976           }
3977         }
3978         // another block
3979         else {
3980           // previous block ended its row
3981           if (actual_block->tableEnd) {
3982             for (j = 0; j < strings->getLength (); j++) {
3983               s->append ((GooString*) strings->get (j));
3984               s->append (eol, eolLen);
3985               delete ((GooString*) strings->get (j));
3986             }
3987             delete strings;
3988 
3989             strings = new GooList();
3990             GooString *t = new GooString ();
3991             strings->append (t);
3992           }
3993           actual_block = frag->line->blk;
3994           actual_line = 0;
3995         }
3996 
3997         page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line)));
3998         last_length = frag->len;
3999 
4000         if (!frag->line->blk->tableEnd) {
4001           ((GooString*) strings->get (actual_line))->append (space, spaceLen);
4002         }
4003       }
4004       // not a table
4005       else {
4006         page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s);
4007         s->append (eol, eolLen);
4008       }
4009     }
4010 
4011     if (strings != NULL) {
4012       for (j = 0; j < strings->getLength (); j++) {
4013         s->append((GooString*) strings->get (j));
4014         s->append(eol, eolLen);
4015         delete ((GooString*) strings->get (j));
4016       }
4017       delete strings;
4018       strings = NULL;
4019       actual_table = -1;
4020       actual_line = -1;
4021       actual_block = NULL;
4022     }
4023   }
4024 
4025   uMap->decRefCnt();
4026 
4027   return s;
4028 }
4029 
4030 class TextSelectionSizer : public TextSelectionVisitor {
4031 public:
4032   TextSelectionSizer(TextPage *page, double scale);
~TextSelectionSizer()4033   ~TextSelectionSizer() { }
4034 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)4035   virtual void visitBlock (TextBlock *block,
4036 			   TextLine *begin,
4037 			   TextLine *end,
4038 			   PDFRectangle *selection) { };
4039   virtual void visitLine (TextLine *line,
4040 			  TextWord *begin,
4041 			  TextWord *end,
4042 			  int edge_begin,
4043 			  int edge_end,
4044 			  PDFRectangle *selection);
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)4045   virtual void visitWord (TextWord *word, int begin, int end,
4046 			  PDFRectangle *selection) { };
4047 
getRegion()4048   GooList *getRegion () { return list; }
4049 
4050 private:
4051   GooList *list;
4052   double scale;
4053 };
4054 
TextSelectionSizer(TextPage * page,double scale)4055 TextSelectionSizer::TextSelectionSizer(TextPage *page, double scale)
4056   : TextSelectionVisitor(page),
4057     scale(scale)
4058 {
4059   list = new GooList();
4060 }
4061 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)4062 void TextSelectionSizer::visitLine (TextLine *line,
4063 				    TextWord *begin,
4064 				    TextWord *end,
4065 				    int edge_begin,
4066 				    int edge_end,
4067 				    PDFRectangle *selection)
4068 {
4069   PDFRectangle *rect;
4070   double x1, y1, x2, y2, margin;
4071 
4072   margin = (line->yMax - line->yMin) / 8;
4073   x1 = line->edge[edge_begin];
4074   y1 = line->yMin - margin;
4075   x2 = line->edge[edge_end];
4076   y2 = line->yMax + margin;
4077 
4078   rect = new PDFRectangle (floor (x1 * scale),
4079 			   floor (y1 * scale),
4080 			   ceil (x2 * scale),
4081 			   ceil (y2 * scale));
4082   list->append (rect);
4083 }
4084 
4085 
4086 class TextSelectionPainter : public TextSelectionVisitor {
4087 public:
4088   TextSelectionPainter(TextPage *page,
4089 		       double scale,
4090 		       int rotation,
4091 		       OutputDev *out,
4092 		       GfxColor *box_color,
4093 		       GfxColor *glyph_color);
4094   ~TextSelectionPainter();
4095 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)4096   virtual void visitBlock (TextBlock *block,
4097 			   TextLine *begin,
4098 			   TextLine *end,
4099 			   PDFRectangle *selection) { };
4100   virtual void visitLine (TextLine *line,
4101 			  TextWord *begin,
4102 			  TextWord *end,
4103 			  int edge_begin,
4104 			  int edge_end,
4105 			  PDFRectangle *selection);
4106   virtual void visitWord (TextWord *word, int begin, int end,
4107 			  PDFRectangle *selection);
4108 
4109 private:
4110   OutputDev *out;
4111   GfxColor *box_color, *glyph_color;
4112   GfxState *state;
4113 };
4114 
TextSelectionPainter(TextPage * page,double scale,int rotation,OutputDev * out,GfxColor * box_color,GfxColor * glyph_color)4115 TextSelectionPainter::TextSelectionPainter(TextPage *page,
4116 					   double scale,
4117 					   int rotation,
4118 					   OutputDev *out,
4119 					   GfxColor *box_color,
4120 					   GfxColor *glyph_color)
4121   : TextSelectionVisitor(page),
4122     out(out),
4123     box_color(box_color),
4124     glyph_color(glyph_color)
4125 {
4126   PDFRectangle box(0, 0, page->pageWidth, page->pageHeight);
4127 
4128   state = new GfxState(72 * scale, 72 * scale, &box, rotation, gFalse);
4129 
4130   out->startPage (0, state);
4131   out->setDefaultCTM (state->getCTM());
4132 
4133   state->setTextMat(1, 0, 0, -1, 0, 0);
4134   state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4135 
4136 }
4137 
~TextSelectionPainter()4138 TextSelectionPainter::~TextSelectionPainter()
4139 {
4140   out->endPage ();
4141 
4142   delete state;
4143 }
4144 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)4145 void TextSelectionPainter::visitLine (TextLine *line,
4146 				      TextWord *begin,
4147 				      TextWord *end,
4148 				      int edge_begin,
4149 				      int edge_end,
4150 				      PDFRectangle *selection)
4151 {
4152   double x1, y1, x2, y2, margin;
4153   Matrix ctm, ictm;
4154 
4155   state->setFillColor(box_color);
4156   out->updateFillColor(state);
4157 
4158   margin = (line->yMax - line->yMin) / 8;
4159   x1 = floor (line->edge[edge_begin]);
4160   y1 = floor (line->yMin - margin);
4161   x2 = ceil (line->edge[edge_end]);
4162   y2 = ceil (line->yMax + margin);
4163 
4164   state->getCTM (&ctm);
4165   ctm.transform(line->edge[edge_begin], line->yMin - margin, &x1, &y1);
4166   ctm.transform(line->edge[edge_end], line->yMax + margin, &x2, &y2);
4167 
4168   x1 = floor (x1);
4169   y1 = floor (y1);
4170   x2 = ceil (x2);
4171   y2 = ceil (y2);
4172 
4173   ctm.invertTo (&ictm);
4174   ictm.transform(x1, y1, &x1, &y1);
4175   ictm.transform(x2, y2, &x2, &y2);
4176 
4177   state->moveTo(x1, y1);
4178   state->lineTo(x2, y1);
4179   state->lineTo(x2, y2);
4180   state->lineTo(x1, y2);
4181   state->closePath();
4182 
4183   out->fill(state);
4184   state->clearPath();
4185 }
4186 
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)4187 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
4188 				      PDFRectangle *selection)
4189 {
4190   GooString *string;
4191   int i;
4192 
4193   state->setFillColor(glyph_color);
4194   out->updateFillColor(state);
4195   word->font->gfxFont->incRefCnt();
4196   state->setFont(word->font->gfxFont, word->fontSize);
4197   out->updateFont(state);
4198 
4199   /* The only purpose of this string is to let the output device query
4200    * it's length.  Might want to change this interface later. */
4201 
4202   string = new GooString ((char *) word->charcode, end - begin);
4203 
4204   out->beginString(state, string);
4205 
4206   for (i = begin; i < end; i++)
4207     out->drawChar(state, word->edge[i], word->base, 0, 0, 0, 0,
4208 		  word->charcode[i], 1, NULL, 0);
4209 
4210   out->endString(state);
4211 
4212   delete string;
4213 }
4214 
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4215 void TextWord::visitSelection(TextSelectionVisitor *visitor,
4216 			      PDFRectangle *selection,
4217 			      SelectionStyle style)
4218 {
4219   int i, begin, end;
4220   double mid;
4221 
4222   begin = len;
4223   end = 0;
4224   for (i = 0; i < len; i++) {
4225     mid = (edge[i] + edge[i + 1]) / 2;
4226     if (selection->x1 < mid || selection->x2 < mid)
4227       if (i < begin)
4228 	begin = i;
4229     if (mid < selection->x1 || mid < selection->x2)
4230       end = i + 1;
4231   }
4232 
4233   /* Skip empty selection. */
4234   if (end <= begin)
4235     return;
4236 
4237   visitor->visitWord (this, begin, end, selection);
4238 }
4239 
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4240 void TextLine::visitSelection(TextSelectionVisitor *visitor,
4241 			      PDFRectangle *selection,
4242 			      SelectionStyle style) {
4243   TextWord *p, *begin, *end, *current;
4244   int i, edge_begin, edge_end;
4245   PDFRectangle child_selection;
4246 
4247   begin = NULL;
4248   end = NULL;
4249   current = NULL;
4250   for (p = words; p != NULL; p = p->next) {
4251     if (blk->page->primaryLR) {
4252       if ((selection->x1 < p->xMax) ||
4253 	  (selection->x2 < p->xMax))
4254         if (begin == NULL)
4255 	  begin = p;
4256 
4257       if (((selection->x1 > p->xMin) ||
4258 	   (selection->x2 > p->xMin)) && (begin != NULL)) {
4259         end = p->next;
4260         current = p;
4261       }
4262     } else {
4263       if ((selection->x1 > p->xMin) ||
4264 	  (selection->x2 > p->xMin))
4265         if (begin == NULL)
4266 	  begin = p;
4267 
4268       if (((selection->x1 < p->xMax) ||
4269 	   (selection->x2 < p->xMax)) && (begin != NULL)) {
4270         end = p->next;
4271         current = p;
4272       }
4273     }
4274   }
4275 
4276   if (!current)
4277     current = begin;
4278 
4279   child_selection = *selection;
4280   if (style == selectionStyleWord) {
4281     child_selection.x1 = begin ? begin->xMin : xMin;
4282     if (end && end->xMax != -1) {
4283       child_selection.x2 = current->xMax;
4284     } else {
4285       child_selection.x2 = xMax;
4286     }
4287   }
4288 
4289   edge_begin = len;
4290   edge_end = 0;
4291   for (i = 0; i < len; i++) {
4292     double mid = (edge[i] + edge[i + 1]) /  2;
4293     if (child_selection.x1 < mid || child_selection.x2 < mid)
4294       if (i < edge_begin)
4295 	edge_begin = i;
4296     if (mid < child_selection.x2 || mid < child_selection.x1)
4297       edge_end = i + 1;
4298   }
4299 
4300   /* Skip empty selection. */
4301   if (edge_end <= edge_begin)
4302     return;
4303 
4304   visitor->visitLine (this, begin, end, edge_begin, edge_end,
4305 		      &child_selection);
4306 
4307   for (p = begin; p != end; p = p->next)
4308     p->visitSelection (visitor, &child_selection, style);
4309 }
4310 
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4311 void TextBlock::visitSelection(TextSelectionVisitor *visitor,
4312 			       PDFRectangle *selection,
4313 			       SelectionStyle style) {
4314   PDFRectangle child_selection;
4315   double x[2], y[2], d, best_d[2];
4316   TextLine *p, *best_line[2];
4317   int i, count = 0, best_count[2], start, stop;
4318   GBool all[2];
4319 
4320   x[0] = selection->x1;
4321   y[0] = selection->y1;
4322   x[1] = selection->x2;
4323   y[1] = selection->y2;
4324 
4325   for (i = 0; i < 2; i++) {
4326     // the first/last lines are often not nearest
4327     // the corners, so we have to force them to be
4328     // selected when the selection runs outside this
4329     // block.
4330     if (page->primaryLR) {
4331       all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
4332       if (x[i] <= this->xMin && y[i] <= this->yMin) {
4333 	best_line[i] = this->lines;
4334 	best_count[i] = 1;
4335       } else {
4336 	best_line[i] = NULL;
4337 	best_count[i] = 0;
4338       }
4339     } else {
4340       all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
4341       if (x[i] >= this->xMax && y[i] <= this->yMin) {
4342 	best_line[i] = this->lines;
4343 	best_count[i] = 1;
4344       } else {
4345 	best_line[i] = NULL;
4346 	best_count[i] = 0;
4347       }
4348     }
4349     best_d[i] = 0;
4350   }
4351 
4352   // find the nearest line to the selection points
4353   // using the manhattan distance.
4354   for (p = this->lines; p; p = p->next) {
4355     count++;
4356     for (i = 0; i < 2; i++) {
4357       d = fmax(p->xMin - x[i], 0.0) +
4358 	fmax(x[i] - p->xMax, 0.0) +
4359 	fmax(p->yMin - y[i], 0.0) +
4360 	fmax(y[i] - p->yMax, 0.0);
4361       if (!best_line[i] || all[i] ||
4362 	  d < best_d[i]) {
4363 	best_line[i] = p;
4364 	best_count[i] = count;
4365 	best_d[i] = d;
4366       }
4367     }
4368   }
4369   // assert: best is always set.
4370   if (!best_line[0] || !best_line[1]) {
4371     return;
4372   }
4373 
4374   // Now decide which point was first.
4375   if (best_count[0] < best_count[1] ||
4376       (best_count[0] == best_count[1] &&
4377        y[0] < y[1])) {
4378     start = 0;
4379     stop = 1;
4380   } else {
4381     start = 1;
4382     stop = 0;
4383   }
4384 
4385   visitor->visitBlock(this, best_line[start], best_line[stop], selection);
4386 
4387   for (p = best_line[start]; p; p = p->next) {
4388     if (page->primaryLR) {
4389       child_selection.x1 = p->xMin;
4390       child_selection.x2 = p->xMax;
4391     } else {
4392       child_selection.x1 = p->xMax;
4393       child_selection.x2 = p->xMin;
4394     }
4395     child_selection.y1 = p->yMin;
4396     child_selection.y2 = p->yMax;
4397     if (style == selectionStyleLine) {
4398       if (p == best_line[start]) {
4399 	child_selection.x1 = 0;
4400 	child_selection.y1 = 0;
4401       }
4402       if (p == best_line[stop]) {
4403 	child_selection.x2 = page->pageWidth;
4404 	child_selection.y2 = page->pageHeight;
4405       }
4406     } else {
4407       if (p == best_line[start]) {
4408 	child_selection.x1 = fmax(p->xMin, fmin(p->xMax, x[start]));
4409 	child_selection.y1 = fmax(p->yMin, fmin(p->yMax, y[start]));
4410       }
4411       if (p == best_line[stop]) {
4412 	child_selection.x2 = fmax(p->xMin, fmin(p->xMax, x[stop]));
4413 	child_selection.y2 = fmax(p->yMin, fmin(p->yMax, y[stop]));
4414       }
4415     }
4416     p->visitSelection(visitor, &child_selection, style);
4417     if (p == best_line[stop]) {
4418       return;
4419     }
4420   }
4421 }
4422 
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4423 void TextPage::visitSelection(TextSelectionVisitor *visitor,
4424 			      PDFRectangle *selection,
4425 			      SelectionStyle style)
4426 {
4427   PDFRectangle child_selection;
4428   double x[2], y[2], d, best_d[2];
4429   double xMin, yMin, xMax, yMax;
4430   TextFlow *flow, *best_flow[2];
4431   TextBlock *blk, *best_block[2];
4432   int i, count = 0, best_count[2], start, stop;
4433 
4434   if (!flows)
4435     return;
4436 
4437   x[0] = selection->x1;
4438   y[0] = selection->y1;
4439   x[1] = selection->x2;
4440   y[1] = selection->y2;
4441 
4442   xMin = pageWidth;
4443   yMin = pageHeight;
4444   xMax = 0.0;
4445   yMax = 0.0;
4446 
4447   for (i = 0; i < 2; i++) {
4448     best_block[i] = NULL;
4449     best_flow[i] = NULL;
4450     best_count[i] = 0;
4451     best_d[i] = 0;
4452   }
4453 
4454   // find the nearest blocks to the selection points
4455   // using the manhattan distance.
4456   for (flow = flows; flow; flow = flow->next) {
4457     for (blk = flow->blocks; blk; blk = blk->next) {
4458       count++;
4459       // the first/last blocks in reading order are
4460       // often not the closest to the page corners;
4461       // track the corners, force those blocks to
4462       // be selected if the selection runs across
4463       // multiple pages.
4464       xMin = fmin(xMin, blk->xMin);
4465       yMin = fmin(yMin, blk->yMin);
4466       xMax = fmax(xMax, blk->xMax);
4467       yMax = fmax(yMax, blk->yMax);
4468       for (i = 0; i < 2; i++) {
4469 	d = fmax(blk->xMin - x[i], 0.0) +
4470 	  fmax(x[i] - blk->xMax, 0.0) +
4471 	  fmax(blk->yMin - y[i], 0.0) +
4472 	  fmax(y[i] - blk->yMax, 0.0);
4473 	if (!best_block[i] ||
4474 	    d < best_d[i] ||
4475 	    (!blk->next && !flow->next &&
4476 	     x[i] > xMax && y[i] > yMax)) {
4477 	  best_block[i] = blk;
4478 	  best_flow[i] = flow;
4479 	  best_count[i] = count;
4480 	  best_d[i] = d;
4481 	}
4482       }
4483     }
4484   }
4485   for (i = 0; i < 2; i++) {
4486     if (primaryLR) {
4487       if (x[i] < xMin && y[i] < yMin) {
4488 	best_block[i] = flows->blocks;
4489 	best_flow[i] = flows;
4490 	best_count[i] = 1;
4491       }
4492     } else {
4493       if (x[i] > xMax && y[i] < yMin) {
4494 	best_block[i] = flows->blocks;
4495 	best_flow[i] = flows;
4496 	best_count[i] = 1;
4497       }
4498     }
4499   }
4500   // assert: best is always set.
4501   if (!best_block[0] || !best_block[1]) {
4502     return;
4503   }
4504 
4505   // Now decide which point was first.
4506   if (best_count[0] < best_count[1] ||
4507       (best_count[0] == best_count[1] && y[0] < y[1])) {
4508     start = 0;
4509     stop = 1;
4510   } else {
4511     start = 1;
4512     stop = 0;
4513   }
4514 
4515   for (flow = best_flow[start]; flow; flow = flow->next) {
4516     if (flow == best_flow[start]) {
4517       blk = best_block[start];
4518     } else {
4519       blk = flow->blocks;
4520     }
4521     for (; blk; blk = blk->next) {
4522       if (primaryLR) {
4523 	child_selection.x1 = blk->xMin;
4524 	child_selection.x2 = blk->xMax;
4525       } else {
4526 	child_selection.x1 = blk->xMax;
4527 	child_selection.x2 = blk->xMin;
4528       }
4529       child_selection.y1 = blk->yMin;
4530       child_selection.y2 = blk->yMax;
4531       if (blk == best_block[start]) {
4532 	child_selection.x1 = fmax(blk->xMin, fmin(blk->xMax, x[start]));
4533 	child_selection.y1 = fmax(blk->yMin, fmin(blk->yMax, y[start]));
4534       }
4535       if (blk == best_block[stop]) {
4536 	child_selection.x2 = fmax(blk->xMin, fmin(blk->xMax, x[stop]));
4537 	child_selection.y2 = fmax(blk->yMin, fmin(blk->yMax, y[stop]));
4538 	blk->visitSelection(visitor, &child_selection, style);
4539 	return;
4540       }
4541       blk->visitSelection(visitor, &child_selection, style);
4542     }
4543   }
4544 }
4545 
drawSelection(OutputDev * out,double scale,int rotation,PDFRectangle * selection,SelectionStyle style,GfxColor * glyph_color,GfxColor * box_color)4546 void TextPage::drawSelection(OutputDev *out,
4547 			     double scale,
4548 			     int rotation,
4549 			     PDFRectangle *selection,
4550 			     SelectionStyle style,
4551 			     GfxColor *glyph_color, GfxColor *box_color)
4552 {
4553   TextSelectionPainter painter(this, scale, rotation,
4554 			       out, box_color, glyph_color);
4555 
4556   visitSelection(&painter, selection, style);
4557 }
4558 
getSelectionRegion(PDFRectangle * selection,SelectionStyle style,double scale)4559 GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
4560 				      SelectionStyle style,
4561 				      double scale) {
4562   TextSelectionSizer sizer(this, scale);
4563 
4564   visitSelection(&sizer, selection, style);
4565 
4566   return sizer.getRegion();
4567 }
4568 
getSelectionText(PDFRectangle * selection,SelectionStyle style)4569 GooString *TextPage::getSelectionText(PDFRectangle *selection,
4570 				      SelectionStyle style)
4571 {
4572   TextSelectionDumper dumper(this);
4573 
4574   visitSelection(&dumper, selection, style);
4575 
4576   return dumper.getText();
4577 }
4578 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)4579 GBool TextPage::findCharRange(int pos, int length,
4580 			      double *xMin, double *yMin,
4581 			      double *xMax, double *yMax) {
4582   TextBlock *blk;
4583   TextLine *line;
4584   TextWord *word;
4585   double xMin0, xMax0, yMin0, yMax0;
4586   double xMin1, xMax1, yMin1, yMax1;
4587   GBool first;
4588   int i, j0, j1;
4589 
4590   if (rawOrder) {
4591     return gFalse;
4592   }
4593 
4594   //~ this doesn't correctly handle:
4595   //~ - ranges split across multiple lines (the highlighted region
4596   //~   is the bounding box of all the parts of the range)
4597   //~ - cases where characters don't convert one-to-one into Unicode
4598   first = gTrue;
4599   xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
4600   xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
4601   for (i = 0; i < nBlocks; ++i) {
4602     blk = blocks[i];
4603     for (line = blk->lines; line; line = line->next) {
4604       for (word = line->words; word; word = word->next) {
4605 	if (pos < word->charPos + word->charLen &&
4606 	    word->charPos < pos + length) {
4607 	  j0 = pos - word->charPos;
4608 	  if (j0 < 0) {
4609 	    j0 = 0;
4610 	  }
4611 	  j1 = pos + length - 1 - word->charPos;
4612 	  if (j1 >= word->len) {
4613 	    j1 = word->len - 1;
4614 	  }
4615 	  switch (line->rot) {
4616 	  case 0:
4617 	    xMin1 = word->edge[j0];
4618 	    xMax1 = word->edge[j1 + 1];
4619 	    yMin1 = word->yMin;
4620 	    yMax1 = word->yMax;
4621 	    break;
4622 	  case 1:
4623 	    xMin1 = word->xMin;
4624 	    xMax1 = word->xMax;
4625 	    yMin1 = word->edge[j0];
4626 	    yMax1 = word->edge[j1 + 1];
4627 	    break;
4628 	  case 2:
4629 	    xMin1 = word->edge[j1 + 1];
4630 	    xMax1 = word->edge[j0];
4631 	    yMin1 = word->yMin;
4632 	    yMax1 = word->yMax;
4633 	    break;
4634 	  case 3:
4635 	    xMin1 = word->xMin;
4636 	    xMax1 = word->xMax;
4637 	    yMin1 = word->edge[j1 + 1];
4638 	    yMax1 = word->edge[j0];
4639 	    break;
4640 	  }
4641 	  if (first || xMin1 < xMin0) {
4642 	    xMin0 = xMin1;
4643 	  }
4644 	  if (first || xMax1 > xMax0) {
4645 	    xMax0 = xMax1;
4646 	  }
4647 	  if (first || yMin1 < yMin0) {
4648 	    yMin0 = yMin1;
4649 	  }
4650 	  if (first || yMax1 > yMax0) {
4651 	    yMax0 = yMax1;
4652 	  }
4653 	  first = gFalse;
4654 	}
4655       }
4656     }
4657   }
4658   if (!first) {
4659     *xMin = xMin0;
4660     *xMax = xMax0;
4661     *yMin = yMin0;
4662     *yMax = yMax0;
4663     return gTrue;
4664   }
4665   return gFalse;
4666 }
4667 
dump(void * outputStream,TextOutputFunc outputFunc,GBool physLayout)4668 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
4669 		    GBool physLayout) {
4670   UnicodeMap *uMap;
4671   TextFlow *flow;
4672   TextBlock *blk;
4673   TextLine *line;
4674   TextLineFrag *frags;
4675   TextWord *word;
4676   int nFrags, fragsSize;
4677   TextLineFrag *frag;
4678   char space[8], eol[16], eop[8];
4679   int spaceLen, eolLen, eopLen;
4680   GBool pageBreaks;
4681   GooString *s;
4682   double delta;
4683   int col, i, j, d, n;
4684 
4685   // get the output encoding
4686   if (!(uMap = globalParams->getTextEncoding())) {
4687     return;
4688   }
4689   spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4690   eolLen = 0; // make gcc happy
4691   switch (globalParams->getTextEOL()) {
4692   case eolUnix:
4693     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4694     break;
4695   case eolDOS:
4696     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4697     eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4698     break;
4699   case eolMac:
4700     eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4701     break;
4702   }
4703   eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
4704   pageBreaks = globalParams->getTextPageBreaks();
4705 
4706   //~ writing mode (horiz/vert)
4707 
4708   // output the page in raw (content stream) order
4709   if (rawOrder) {
4710 
4711     for (word = rawWords; word; word = word->next) {
4712       s = new GooString();
4713       dumpFragment(word->text, word->len, uMap, s);
4714       (*outputFunc)(outputStream, s->getCString(), s->getLength());
4715       delete s;
4716       if (word->next &&
4717 	  fabs(word->next->base - word->base) <
4718 	    maxIntraLineDelta * word->fontSize) {
4719 	if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
4720 	  (*outputFunc)(outputStream, space, spaceLen);
4721 	}
4722       } else {
4723 	(*outputFunc)(outputStream, eol, eolLen);
4724       }
4725     }
4726 
4727   // output the page, maintaining the original physical layout
4728   } else if (physLayout) {
4729 
4730     // collect the line fragments for the page and sort them
4731     fragsSize = 256;
4732     frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4733     nFrags = 0;
4734     for (i = 0; i < nBlocks; ++i) {
4735       blk = blocks[i];
4736       for (line = blk->lines; line; line = line->next) {
4737 	if (nFrags == fragsSize) {
4738 	  fragsSize *= 2;
4739 	  frags = (TextLineFrag *)greallocn(frags,
4740 					    fragsSize, sizeof(TextLineFrag));
4741 	}
4742 	frags[nFrags].init(line, 0, line->len);
4743 	frags[nFrags].computeCoords(gTrue);
4744 	++nFrags;
4745       }
4746     }
4747     qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
4748     i = 0;
4749     while (i < nFrags) {
4750       delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4751       for (j = i+1;
4752 	   j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
4753 	   ++j) ;
4754       qsort(frags + i, j - i, sizeof(TextLineFrag),
4755 	    &TextLineFrag::cmpXYColumnPrimaryRot);
4756       i = j;
4757     }
4758 
4759 #if 0 // for debugging
4760     printf("*** line fragments ***\n");
4761     for (i = 0; i < nFrags; ++i) {
4762       frag = &frags[i];
4763       printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
4764 	     frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
4765       for (n = 0; n < frag->len; ++n) {
4766 	fputc(frag->line->text[frag->start + n] & 0xff, stdout);
4767       }
4768       printf("'\n");
4769     }
4770     printf("\n");
4771 #endif
4772 
4773     // generate output
4774     col = 0;
4775     for (i = 0; i < nFrags; ++i) {
4776       frag = &frags[i];
4777 
4778       // column alignment
4779       for (; col < frag->col; ++col) {
4780 	(*outputFunc)(outputStream, space, spaceLen);
4781       }
4782 
4783       // print the line
4784       s = new GooString();
4785       col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4786       (*outputFunc)(outputStream, s->getCString(), s->getLength());
4787       delete s;
4788 
4789       // print one or more returns if necessary
4790       if (i == nFrags - 1 ||
4791 	  frags[i+1].col < col ||
4792 	  fabs(frags[i+1].base - frag->base) >
4793 	    maxIntraLineDelta * frag->line->words->fontSize) {
4794 	if (i < nFrags - 1) {
4795 	  d = (int)((frags[i+1].base - frag->base) /
4796 		    frag->line->words->fontSize);
4797 	  if (d < 1) {
4798 	    d = 1;
4799 	  } else if (d > 5) {
4800 	    d = 5;
4801 	  }
4802 	} else {
4803 	  d = 1;
4804 	}
4805 	for (; d > 0; --d) {
4806 	  (*outputFunc)(outputStream, eol, eolLen);
4807 	}
4808 	col = 0;
4809       }
4810     }
4811 
4812     gfree(frags);
4813 
4814   // output the page, "undoing" the layout
4815   } else {
4816     for (flow = flows; flow; flow = flow->next) {
4817       for (blk = flow->blocks; blk; blk = blk->next) {
4818 	for (line = blk->lines; line; line = line->next) {
4819 	  n = line->len;
4820 	  if (line->hyphenated && (line->next || blk->next)) {
4821 	    --n;
4822 	  }
4823 	  s = new GooString();
4824 	  dumpFragment(line->text, n, uMap, s);
4825 	  (*outputFunc)(outputStream, s->getCString(), s->getLength());
4826 	  delete s;
4827 	  // output a newline when a hyphen is not suppressed
4828 	  if (n == line->len) {
4829 	    (*outputFunc)(outputStream, eol, eolLen);
4830 	  }
4831 	}
4832       }
4833       (*outputFunc)(outputStream, eol, eolLen);
4834     }
4835   }
4836 
4837   // end of page
4838   if (pageBreaks) {
4839     (*outputFunc)(outputStream, eop, eopLen);
4840   }
4841 
4842   uMap->decRefCnt();
4843 }
4844 
assignColumns(TextLineFrag * frags,int nFrags,GBool oneRot)4845 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
4846   TextLineFrag *frag0, *frag1;
4847   int rot, col1, col2, i, j, k;
4848 
4849   // all text in the region has the same rotation -- recompute the
4850   // column numbers based only on the text in the region
4851   if (oneRot) {
4852     qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
4853     rot = frags[0].line->rot;
4854     for (i = 0; i < nFrags; ++i) {
4855       frag0 = &frags[i];
4856       col1 = 0;
4857       for (j = 0; j < i; ++j) {
4858 	frag1 = &frags[j];
4859 	col2 = 0; // make gcc happy
4860 	switch (rot) {
4861 	case 0:
4862 	  if (frag0->xMin >= frag1->xMax) {
4863 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4864 				 frag1->line->col[frag1->start]) + 1;
4865 	  } else {
4866 	    for (k = frag1->start;
4867 		 k < frag1->start + frag1->len &&
4868 		   frag0->xMin >= 0.5 * (frag1->line->edge[k] +
4869 					 frag1->line->edge[k+1]);
4870 		 ++k) ;
4871 	    col2 = frag1->col +
4872 	           frag1->line->col[k] - frag1->line->col[frag1->start];
4873 	  }
4874 	  break;
4875 	case 1:
4876 	  if (frag0->yMin >= frag1->yMax) {
4877 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4878 				 frag1->line->col[frag1->start]) + 1;
4879 	  } else {
4880 	    for (k = frag1->start;
4881 		 k < frag1->start + frag1->len &&
4882 		   frag0->yMin >= 0.5 * (frag1->line->edge[k] +
4883 					 frag1->line->edge[k+1]);
4884 		 ++k) ;
4885 	    col2 = frag1->col +
4886 	           frag1->line->col[k] - frag1->line->col[frag1->start];
4887 	  }
4888 	  break;
4889 	case 2:
4890 	  if (frag0->xMax <= frag1->xMin) {
4891 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4892 				 frag1->line->col[frag1->start]) + 1;
4893 	  } else {
4894 	    for (k = frag1->start;
4895 		 k < frag1->start + frag1->len &&
4896 		   frag0->xMax <= 0.5 * (frag1->line->edge[k] +
4897 					 frag1->line->edge[k+1]);
4898 		 ++k) ;
4899 	    col2 = frag1->col +
4900 	           frag1->line->col[k] - frag1->line->col[frag1->start];
4901 	  }
4902 	  break;
4903 	case 3:
4904 	  if (frag0->yMax <= frag1->yMin) {
4905 	    col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4906 				 frag1->line->col[frag1->start]) + 1;
4907 	  } else {
4908 	    for (k = frag1->start;
4909 		 k < frag1->start + frag1->len &&
4910 		   frag0->yMax <= 0.5 * (frag1->line->edge[k] +
4911 					 frag1->line->edge[k+1]);
4912 		 ++k) ;
4913 	    col2 = frag1->col +
4914 	           frag1->line->col[k] - frag1->line->col[frag1->start];
4915 	  }
4916 	  break;
4917 	}
4918 	if (col2 > col1) {
4919 	  col1 = col2;
4920 	}
4921       }
4922       frag0->col = col1;
4923     }
4924 
4925   // the region includes text at different rotations -- use the
4926   // globally assigned column numbers, offset by the minimum column
4927   // number (i.e., shift everything over to column 0)
4928   } else {
4929     col1 = frags[0].col;
4930     for (i = 1; i < nFrags; ++i) {
4931       if (frags[i].col < col1) {
4932 	col1 = frags[i].col;
4933       }
4934     }
4935     for (i = 0; i < nFrags; ++i) {
4936       frags[i].col -= col1;
4937     }
4938   }
4939 }
4940 
dumpFragment(Unicode * text,int len,UnicodeMap * uMap,GooString * s)4941 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
4942 			   GooString *s) {
4943   char lre[8], rle[8], popdf[8], buf[8];
4944   int lreLen, rleLen, popdfLen, n;
4945   int nCols, i, j, k;
4946 
4947   nCols = 0;
4948 
4949   if (uMap->isUnicode()) {
4950 
4951     lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
4952     rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
4953     popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
4954 
4955     if (primaryLR) {
4956 
4957       i = 0;
4958       while (i < len) {
4959 	// output a left-to-right section
4960 	for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
4961 	for (k = i; k < j; ++k) {
4962 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4963 	  s->append(buf, n);
4964 	  ++nCols;
4965 	}
4966 	i = j;
4967 	// output a right-to-left section
4968 	for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
4969 	if (j > i) {
4970 	  s->append(rle, rleLen);
4971 	  for (k = j - 1; k >= i; --k) {
4972 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4973 	    s->append(buf, n);
4974 	    ++nCols;
4975 	  }
4976 	  s->append(popdf, popdfLen);
4977 	  i = j;
4978 	}
4979       }
4980 
4981     } else {
4982 
4983       s->append(rle, rleLen);
4984       i = len - 1;
4985       while (i >= 0) {
4986 	// output a right-to-left section
4987 	for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
4988 	for (k = i; k > j; --k) {
4989 	  n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4990 	  s->append(buf, n);
4991 	  ++nCols;
4992 	}
4993 	i = j;
4994 	// output a left-to-right section
4995 	for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
4996 	if (j < i) {
4997 	  s->append(lre, lreLen);
4998 	  for (k = j + 1; k <= i; ++k) {
4999 	    n = uMap->mapUnicode(text[k], buf, sizeof(buf));
5000 	    s->append(buf, n);
5001 	    ++nCols;
5002 	  }
5003 	  s->append(popdf, popdfLen);
5004 	  i = j;
5005 	}
5006       }
5007       s->append(popdf, popdfLen);
5008 
5009     }
5010 
5011   } else {
5012     for (i = 0; i < len; ++i) {
5013       n = uMap->mapUnicode(text[i], buf, sizeof(buf));
5014       s->append(buf, n);
5015       nCols += n;
5016     }
5017   }
5018 
5019   return nCols;
5020 }
5021 
5022 #if TEXTOUT_WORD_LIST
makeWordList(GBool physLayout)5023 TextWordList *TextPage::makeWordList(GBool physLayout) {
5024   return new TextWordList(this, physLayout);
5025 }
5026 #endif
5027 
5028 //------------------------------------------------------------------------
5029 // ActualText
5030 //------------------------------------------------------------------------
ActualText(TextPage * out)5031 ActualText::ActualText(TextPage *out) {
5032   out->incRefCnt();
5033   text = out;
5034   actualText = NULL;
5035   actualTextBMCLevel = 0;
5036 }
5037 
~ActualText()5038 ActualText::~ActualText() {
5039   if (actualText)
5040     delete actualText;
5041   text->decRefCnt();
5042 }
5043 
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)5044 void ActualText::addChar(GfxState *state, double x, double y,
5045 			 double dx, double dy,
5046 			 CharCode c, int nBytes, Unicode *u, int uLen) {
5047   if (actualTextBMCLevel == 0) {
5048     text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5049   } else {
5050     // Inside ActualText span.
5051     if (newActualTextSpan) {
5052       actualText_x = x;
5053       actualText_y = y;
5054       actualText_dx = dx;
5055       actualText_dy = dy;
5056       newActualTextSpan = gFalse;
5057     } else {
5058       if (x < actualText_x)
5059 	actualText_x = x;
5060       if (y < actualText_y)
5061 	actualText_y = y;
5062       if (x + dx > actualText_x + actualText_dx)
5063 	actualText_dx = x + dx - actualText_x;
5064       if (y + dy > actualText_y + actualText_dy)
5065 	actualText_dy = y + dy - actualText_y;
5066     }
5067   }
5068 }
5069 
beginMC(Dict * properties)5070 void ActualText::beginMC(Dict *properties) {
5071   if (actualTextBMCLevel > 0) {
5072     // Already inside a ActualText span.
5073     actualTextBMCLevel++;
5074     return;
5075   }
5076 
5077   Object obj;
5078   if (properties && properties->lookup("ActualText", &obj)) {
5079     if (obj.isString()) {
5080       actualText = obj.getString();
5081       actualTextBMCLevel = 1;
5082       newActualTextSpan = gTrue;
5083     }
5084   }
5085 }
5086 
endMC(GfxState * state)5087 void ActualText::endMC(GfxState *state) {
5088   char *uniString = NULL;
5089   Unicode *uni;
5090   int length, i;
5091 
5092   if (actualTextBMCLevel > 0) {
5093     actualTextBMCLevel--;
5094     if (actualTextBMCLevel == 0) {
5095       // ActualText span closed. Output the span text and the
5096       // extents of all the glyphs inside the span
5097 
5098       if (newActualTextSpan) {
5099 	// No content inside span.
5100 	actualText_x = state->getCurX();
5101 	actualText_y = state->getCurY();
5102 	actualText_dx = 0;
5103 	actualText_dy = 0;
5104       }
5105 
5106       if (!actualText->hasUnicodeMarker()) {
5107 	if (actualText->getLength() > 0) {
5108 	  //non-unicode string -- assume pdfDocEncoding and
5109 	  //try to convert to UTF16BE
5110 	  uniString = pdfDocEncodingToUTF16(actualText, &length);
5111 	} else {
5112 	  length = 0;
5113 	}
5114       } else {
5115 	uniString = actualText->getCString();
5116 	length = actualText->getLength();
5117       }
5118 
5119       if (length < 3)
5120 	length = 0;
5121       else
5122 	length = length/2 - 1;
5123       uni = new Unicode[length];
5124       for (i = 0 ; i < length; i++)
5125 	uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
5126 
5127       text->addChar(state,
5128 		    actualText_x, actualText_y,
5129 		    actualText_dx, actualText_dy,
5130 		    0, 1, uni, length);
5131 
5132       delete [] uni;
5133       if (!actualText->hasUnicodeMarker())
5134 	delete [] uniString;
5135       delete actualText;
5136       actualText = NULL;
5137     }
5138   }
5139 }
5140 
5141 //------------------------------------------------------------------------
5142 // TextOutputDev
5143 //------------------------------------------------------------------------
5144 
TextOutputDev_outputToFile(void * stream,char * text,int len)5145 static void TextOutputDev_outputToFile(void *stream, char *text, int len) {
5146   fwrite(text, 1, len, (FILE *)stream);
5147 }
5148 
TextOutputDev(char * fileName,GBool physLayoutA,GBool rawOrderA,GBool append)5149 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
5150 			     GBool rawOrderA, GBool append) {
5151   text = NULL;
5152   physLayout = physLayoutA;
5153   rawOrder = rawOrderA;
5154   doHTML = gFalse;
5155   ok = gTrue;
5156 
5157   // open file
5158   needClose = gFalse;
5159   if (fileName) {
5160     if (!strcmp(fileName, "-")) {
5161       outputStream = stdout;
5162 #ifdef _WIN32
5163       // keep DOS from munging the end-of-line characters
5164       setmode(fileno(stdout), O_BINARY);
5165 #endif
5166     } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
5167       needClose = gTrue;
5168     } else {
5169       error(-1, "Couldn't open text file '%s'", fileName);
5170       ok = gFalse;
5171       actualText = NULL;
5172       return;
5173     }
5174     outputFunc = &TextOutputDev_outputToFile;
5175   } else {
5176     outputStream = NULL;
5177   }
5178 
5179   // set up text object
5180   text = new TextPage(rawOrderA);
5181   actualText = new ActualText(text);
5182 }
5183 
TextOutputDev(TextOutputFunc func,void * stream,GBool physLayoutA,GBool rawOrderA)5184 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
5185 			     GBool physLayoutA, GBool rawOrderA) {
5186   outputFunc = func;
5187   outputStream = stream;
5188   needClose = gFalse;
5189   physLayout = physLayoutA;
5190   rawOrder = rawOrderA;
5191   doHTML = gFalse;
5192   text = new TextPage(rawOrderA);
5193   actualText = new ActualText(text);
5194   ok = gTrue;
5195 }
5196 
~TextOutputDev()5197 TextOutputDev::~TextOutputDev() {
5198   if (needClose) {
5199 #ifdef MACOS
5200     ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
5201 #endif
5202     fclose((FILE *)outputStream);
5203   }
5204   if (text) {
5205     text->decRefCnt();
5206   }
5207   delete actualText;
5208 }
5209 
startPage(int pageNum,GfxState * state)5210 void TextOutputDev::startPage(int pageNum, GfxState *state) {
5211   text->startPage(state);
5212 }
5213 
endPage()5214 void TextOutputDev::endPage() {
5215   text->endPage();
5216   text->coalesce(physLayout, doHTML);
5217   if (outputStream) {
5218     text->dump(outputStream, outputFunc, physLayout);
5219   }
5220 }
5221 
updateFont(GfxState * state)5222 void TextOutputDev::updateFont(GfxState *state) {
5223   text->updateFont(state);
5224 }
5225 
beginString(GfxState * state,GooString * s)5226 void TextOutputDev::beginString(GfxState *state, GooString *s) {
5227 }
5228 
endString(GfxState * state)5229 void TextOutputDev::endString(GfxState *state) {
5230 }
5231 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,Unicode * u,int uLen)5232 void TextOutputDev::drawChar(GfxState *state, double x, double y,
5233 			     double dx, double dy,
5234 			     double originX, double originY,
5235 			     CharCode c, int nBytes, Unicode *u, int uLen) {
5236   actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5237 }
5238 
beginMarkedContent(char * name,Dict * properties)5239 void TextOutputDev::beginMarkedContent(char *name, Dict *properties)
5240 {
5241   actualText->beginMC(properties);
5242 }
5243 
endMarkedContent(GfxState * state)5244 void TextOutputDev::endMarkedContent(GfxState *state)
5245 {
5246   actualText->endMC(state);
5247 }
5248 
stroke(GfxState * state)5249 void TextOutputDev::stroke(GfxState *state) {
5250   GfxPath *path;
5251   GfxSubpath *subpath;
5252   double x[2], y[2];
5253 
5254   if (!doHTML) {
5255     return;
5256   }
5257   path = state->getPath();
5258   if (path->getNumSubpaths() != 1) {
5259     return;
5260   }
5261   subpath = path->getSubpath(0);
5262   if (subpath->getNumPoints() != 2) {
5263     return;
5264   }
5265   state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
5266   state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
5267 
5268   // look for a vertical or horizontal line
5269   if (x[0] == x[1] || y[0] == y[1]) {
5270     text->addUnderline(x[0], y[0], x[1], y[1]);
5271   }
5272 }
5273 
fill(GfxState * state)5274 void TextOutputDev::fill(GfxState *state) {
5275   GfxPath *path;
5276   GfxSubpath *subpath;
5277   double x[5], y[5];
5278   double rx0, ry0, rx1, ry1, t;
5279   int i;
5280 
5281   if (!doHTML) {
5282     return;
5283   }
5284   path = state->getPath();
5285   if (path->getNumSubpaths() != 1) {
5286     return;
5287   }
5288   subpath = path->getSubpath(0);
5289   if (subpath->getNumPoints() != 5) {
5290     return;
5291   }
5292   for (i = 0; i < 5; ++i) {
5293     if (subpath->getCurve(i)) {
5294       return;
5295     }
5296     state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
5297   }
5298 
5299   // look for a rectangle
5300   if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
5301       x[0] == x[4] && y[0] == y[4]) {
5302     rx0 = x[0];
5303     ry0 = y[0];
5304     rx1 = x[2];
5305     ry1 = y[1];
5306   } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
5307 	     x[0] == x[4] && y[0] == y[4]) {
5308     rx0 = x[0];
5309     ry0 = y[0];
5310     rx1 = x[1];
5311     ry1 = y[2];
5312   } else {
5313     return;
5314   }
5315   if (rx1 < rx0) {
5316     t = rx0;
5317     rx0 = rx1;
5318     rx1 = t;
5319   }
5320   if (ry1 < ry0) {
5321     t = ry0;
5322     ry0 = ry1;
5323     ry1 = t;
5324   }
5325 
5326   // skinny horizontal rectangle
5327   if (ry1 - ry0 < rx1 - rx0) {
5328     if (ry1 - ry0 < maxUnderlineWidth) {
5329       ry0 = 0.5 * (ry0 + ry1);
5330       text->addUnderline(rx0, ry0, rx1, ry0);
5331     }
5332 
5333   // skinny vertical rectangle
5334   } else {
5335     if (rx1 - rx0 < maxUnderlineWidth) {
5336       rx0 = 0.5 * (rx0 + rx1);
5337       text->addUnderline(rx0, ry0, rx0, ry1);
5338     }
5339   }
5340 }
5341 
eoFill(GfxState * state)5342 void TextOutputDev::eoFill(GfxState *state) {
5343   if (!doHTML) {
5344     return;
5345   }
5346   fill(state);
5347 }
5348 
processLink(Link * link,Catalog *)5349 void TextOutputDev::processLink(Link *link, Catalog * /*catalog*/) {
5350   double x1, y1, x2, y2;
5351   int xMin, yMin, xMax, yMax, x, y;
5352 
5353   if (!doHTML) {
5354     return;
5355   }
5356   link->getRect(&x1, &y1, &x2, &y2);
5357   cvtUserToDev(x1, y1, &x, &y);
5358   xMin = xMax = x;
5359   yMin = yMax = y;
5360   cvtUserToDev(x1, y2, &x, &y);
5361   if (x < xMin) {
5362     xMin = x;
5363   } else if (x > xMax) {
5364     xMax = x;
5365   }
5366   if (y < yMin) {
5367     yMin = y;
5368   } else if (y > yMax) {
5369     yMax = y;
5370   }
5371   cvtUserToDev(x2, y1, &x, &y);
5372   if (x < xMin) {
5373     xMin = x;
5374   } else if (x > xMax) {
5375     xMax = x;
5376   }
5377   if (y < yMin) {
5378     yMin = y;
5379   } else if (y > yMax) {
5380     yMax = y;
5381   }
5382   cvtUserToDev(x2, y2, &x, &y);
5383   if (x < xMin) {
5384     xMin = x;
5385   } else if (x > xMax) {
5386     xMax = x;
5387   }
5388   if (y < yMin) {
5389     yMin = y;
5390   } else if (y > yMax) {
5391     yMax = y;
5392   }
5393   text->addLink(xMin, yMin, xMax, yMax, link);
5394 }
5395 
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)5396 GBool TextOutputDev::findText(Unicode *s, int len,
5397 			      GBool startAtTop, GBool stopAtBottom,
5398 			      GBool startAtLast, GBool stopAtLast,
5399 			      GBool caseSensitive, GBool backward,
5400 			      double *xMin, double *yMin,
5401 			      double *xMax, double *yMax) {
5402   return text->findText(s, len, startAtTop, stopAtBottom,
5403 			startAtLast, stopAtLast, caseSensitive, backward,
5404 			xMin, yMin, xMax, yMax);
5405 }
5406 
getText(double xMin,double yMin,double xMax,double yMax)5407 GooString *TextOutputDev::getText(double xMin, double yMin,
5408 				double xMax, double yMax) {
5409   return text->getText(xMin, yMin, xMax, yMax);
5410 }
5411 
drawSelection(OutputDev * out,double scale,int rotation,PDFRectangle * selection,SelectionStyle style,GfxColor * glyph_color,GfxColor * box_color)5412 void TextOutputDev::drawSelection(OutputDev *out,
5413 				  double scale,
5414 				  int rotation,
5415 				  PDFRectangle *selection,
5416 				  SelectionStyle style,
5417 				  GfxColor *glyph_color, GfxColor *box_color) {
5418   text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5419 }
5420 
getSelectionRegion(PDFRectangle * selection,SelectionStyle style,double scale)5421 GooList *TextOutputDev::getSelectionRegion(PDFRectangle *selection,
5422 					   SelectionStyle style,
5423 					   double scale) {
5424   return text->getSelectionRegion(selection, style, scale);
5425 }
5426 
getSelectionText(PDFRectangle * selection,SelectionStyle style)5427 GooString *TextOutputDev::getSelectionText(PDFRectangle *selection,
5428 					   SelectionStyle style)
5429 {
5430   return text->getSelectionText(selection, style);
5431 }
5432 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)5433 GBool TextOutputDev::findCharRange(int pos, int length,
5434 				   double *xMin, double *yMin,
5435 				   double *xMax, double *yMax) {
5436   return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5437 }
5438 
5439 #if TEXTOUT_WORD_LIST
makeWordList()5440 TextWordList *TextOutputDev::makeWordList() {
5441   return text->makeWordList(physLayout);
5442 }
5443 #endif
5444 
takeText()5445 TextPage *TextOutputDev::takeText() {
5446   TextPage *ret;
5447 
5448   ret = text;
5449   text = new TextPage(rawOrder);
5450   return ret;
5451 }
5452