1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18 // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21 // Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23 // Copyright (C) 2008, 2010-2012, 2014-2021 Albert Astals Cid <aacid@kde.org>
24 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
25 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29 // Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com>
30 // Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31 // Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32 // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33 // Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34 // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35 // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37 // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38 // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42 // Copyright (C) 2018-2021 Nelson Benítez León <nbenitezl@gmail.com>
43 // Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
45 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46 // Copyright (C) 2021 Peter Williams <peter@newton.cx>
47 //
48 // To see a description of the changes please see the Changelog file that
49 // came with your tarball or type make ChangeLog if you are building from git
50 //
51 //========================================================================
52 
53 #include <config.h>
54 
55 #include <cstdio>
56 #include <cstdlib>
57 #include <cstddef>
58 #include <cmath>
59 #include <cfloat>
60 #include <cctype>
61 #include <algorithm>
62 #if defined(_WIN32) || defined(__CYGWIN__)
63 #    include <fcntl.h> // for O_BINARY
64 #    include <io.h> // for _setmode
65 #endif
66 #include "goo/gfile.h"
67 #include "goo/gmem.h"
68 #include "goo/GooString.h"
69 #include "poppler-config.h"
70 #include "Error.h"
71 #include "GlobalParams.h"
72 #include "UnicodeMap.h"
73 #include "UnicodeTypeTable.h"
74 #include "Link.h"
75 #include "TextOutputDev.h"
76 #include "Page.h"
77 #include "Annot.h"
78 #include "UTF.h"
79 
80 //------------------------------------------------------------------------
81 // parameters
82 //------------------------------------------------------------------------
83 
84 // Each bucket in a text pool includes baselines within a range of
85 // this many points.
86 #define textPoolStep 4
87 
88 // Inter-character space width which will cause addChar to start a new
89 // word.
90 #define minWordBreakSpace 0.1
91 
92 // Negative inter-character space width, i.e., overlap, which will
93 // cause addChar to start a new word.
94 #define minDupBreakOverlap 0.2
95 
96 // Max distance between baselines of two lines within a block, as a
97 // fraction of the font size.
98 #define maxLineSpacingDelta 1.5
99 
100 // Max difference in primary font sizes on two lines in the same
101 // block.  Delta1 is used when examining new lines above and below the
102 // current block; delta2 is used when examining text that overlaps the
103 // current block; delta3 is used when examining text to the left and
104 // right of the current block.
105 #define maxBlockFontSizeDelta1 0.05
106 #define maxBlockFontSizeDelta2 0.6
107 #define maxBlockFontSizeDelta3 0.2
108 
109 // Max difference in font sizes inside a word.
110 #define maxWordFontSizeDelta 0.05
111 
112 // Maximum distance between baselines of two words on the same line,
113 // e.g., distance between subscript or superscript and the primary
114 // baseline, as a fraction of the font size.
115 #define maxIntraLineDelta 0.5
116 
117 // Minimum inter-word spacing, as a fraction of the font size.  (Only
118 // used for raw ordering.)
119 #define minWordSpacing 0.15
120 
121 // Maximum inter-word spacing, as a fraction of the font size.
122 #define maxWordSpacing 1.5
123 
124 // Maximum horizontal spacing which will allow a word to be pulled
125 // into a block.
126 #define minColSpacing1 0.3
127 
128 // Minimum spacing between columns, as a fraction of the font size.
129 #define minColSpacing2 1.0
130 
131 // Maximum vertical spacing between blocks within a flow, as a
132 // multiple of the font size.
133 #define maxBlockSpacing 2.5
134 
135 // Minimum spacing between characters within a word, as a fraction of
136 // the font size.
137 #define minCharSpacing -0.5
138 
139 // Maximum spacing between characters within a word, as a fraction of
140 // the font size, when there is no obvious extra-wide character
141 // spacing.
142 #define maxCharSpacing 0.03
143 
144 // When extra-wide character spacing is detected, the inter-character
145 // space threshold is set to the minimum inter-character space
146 // multiplied by this constant.
147 #define maxWideCharSpacingMul 1.3
148 
149 // Upper limit on spacing between characters in a word.
150 #define maxWideCharSpacing 0.4
151 
152 // Max difference in primary,secondary coordinates (as a fraction of
153 // the font size) allowed for duplicated text (fake boldface, drop
154 // shadows) which is to be discarded.
155 #define dupMaxPriDelta 0.1
156 #define dupMaxSecDelta 0.2
157 
158 // Max width of underlines (in points).
159 #define maxUnderlineWidth 3
160 
161 // Min distance between baseline and underline (in points).
162 //~ this should be font-size-dependent
163 #define minUnderlineGap -2
164 
165 // Max distance between baseline and underline (in points).
166 //~ this should be font-size-dependent
167 #define maxUnderlineGap 4
168 
169 // Max horizontal distance between edge of word and start of underline
170 // (in points).
171 //~ this should be font-size-dependent
172 #define underlineSlack 1
173 
174 // Max distance between edge of text and edge of link border
175 #define hyperlinkSlack 2
176 
177 // Max distance between characters when combining a base character and
178 // combining character
179 #define combMaxMidDelta 0.3
180 #define combMaxBaseDelta 0.4
181 
182 // Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
183 // (Or 1/tan(angle) for 90/270 degrees.)
184 #define diagonalThreshold 0.1
185 
186 // How opaque a selection on a glyphless font should be. Since the font is
187 // glyphless and overlaid over text in image form, this must enable users
188 // to read the underlying image. Issue #157
189 #define glyphlessSelectionOpacity 0.4
190 
191 // Returns whether x is between a and b or equal to a or b.
192 // a and b don't need to be sorted.
193 #define XBetweenAB(x, a, b) (!(((x) > (a) && (x) > (b)) || ((x) < (a) && (x) < (b))) ? true : false)
194 
195 namespace {
196 
isAscii7(Unicode uchar)197 inline bool isAscii7(Unicode uchar)
198 {
199     return uchar < 128;
200 }
201 
202 }
203 
reorderText(const Unicode * text,int len,const UnicodeMap * uMap,bool primaryLR,GooString * s,Unicode * u)204 static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u)
205 {
206     char lre[8], rle[8], popdf[8], buf[8];
207     int lreLen = 0, rleLen = 0, popdfLen = 0, n;
208     int nCols, i, j, k;
209 
210     nCols = 0;
211 
212     if (s) {
213         lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
214         rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
215         popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
216     }
217 
218     if (primaryLR) {
219         i = 0;
220         while (i < len) {
221             // output a left-to-right section
222             for (j = i; j < len && !unicodeTypeR(text[j]); ++j)
223                 ;
224             for (k = i; k < j; ++k) {
225                 if (s) {
226                     n = uMap->mapUnicode(text[k], buf, sizeof(buf));
227                     s->append(buf, n);
228                 }
229                 if (u)
230                     u[nCols] = text[k];
231                 ++nCols;
232             }
233             i = j;
234             // output a right-to-left section
235             for (j = i; j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); ++j)
236                 ;
237             if (j > i) {
238                 if (s)
239                     s->append(rle, rleLen);
240                 for (k = j - 1; k >= i; --k) {
241                     if (s) {
242                         n = uMap->mapUnicode(text[k], buf, sizeof(buf));
243                         s->append(buf, n);
244                     }
245                     if (u)
246                         u[nCols] = text[k];
247                     ++nCols;
248                 }
249                 if (s)
250                     s->append(popdf, popdfLen);
251                 i = j;
252             }
253         }
254     } else {
255         // Note: This code treats numeric characters (European and
256         // Arabic/Indic) as left-to-right, which isn't strictly correct
257         // (incurs extra LRE/POPDF pairs), but does produce correct
258         // visual formatting.
259         if (s)
260             s->append(rle, rleLen);
261         i = len - 1;
262         while (i >= 0) {
263             // output a right-to-left section
264             for (j = i; j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); --j)
265                 ;
266             for (k = i; k > j; --k) {
267                 if (s) {
268                     n = uMap->mapUnicode(text[k], buf, sizeof(buf));
269                     s->append(buf, n);
270                 }
271                 if (u)
272                     u[nCols] = text[k];
273                 ++nCols;
274             }
275             i = j;
276             // output a left-to-right section
277             for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j)
278                 ;
279             if (j < i) {
280                 if (s)
281                     s->append(lre, lreLen);
282                 for (k = j + 1; k <= i; ++k) {
283                     if (s) {
284                         n = uMap->mapUnicode(text[k], buf, sizeof(buf));
285                         s->append(buf, n);
286                     }
287                     if (u)
288                         u[nCols] = text[k];
289                     ++nCols;
290                 }
291                 if (s)
292                     s->append(popdf, popdfLen);
293                 i = j;
294             }
295         }
296         if (s)
297             s->append(popdf, popdfLen);
298     }
299 
300     return nCols;
301 }
302 
303 //------------------------------------------------------------------------
304 // TextUnderline
305 //------------------------------------------------------------------------
306 
307 class TextUnderline
308 {
309 public:
TextUnderline(double x0A,double y0A,double x1A,double y1A)310     TextUnderline(double x0A, double y0A, double x1A, double y1A)
311     {
312         x0 = x0A;
313         y0 = y0A;
314         x1 = x1A;
315         y1 = y1A;
316         horiz = y0 == y1;
317     }
~TextUnderline()318     ~TextUnderline() { }
319 
320     double x0, y0, x1, y1;
321     bool horiz;
322 };
323 
324 //------------------------------------------------------------------------
325 // TextLink
326 //------------------------------------------------------------------------
327 
328 class TextLink
329 {
330 public:
TextLink(int xMinA,int yMinA,int xMaxA,int yMaxA,AnnotLink * linkA)331     TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
332     {
333         xMin = xMinA;
334         yMin = yMinA;
335         xMax = xMaxA;
336         yMax = yMaxA;
337         link = linkA;
338     }
~TextLink()339     ~TextLink() { }
340 
341     int xMin, yMin, xMax, yMax;
342     AnnotLink *link;
343 };
344 
345 //------------------------------------------------------------------------
346 // TextFontInfo
347 //------------------------------------------------------------------------
348 
TextFontInfo(const GfxState * state)349 TextFontInfo::TextFontInfo(const GfxState *state)
350 {
351     gfxFont = state->getFont();
352     if (gfxFont)
353         gfxFont->incRefCnt();
354 #ifdef TEXTOUT_WORD_LIST
355     fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy() : nullptr;
356     flags = gfxFont ? gfxFont->getFlags() : 0;
357 #endif
358 }
359 
~TextFontInfo()360 TextFontInfo::~TextFontInfo()
361 {
362     if (gfxFont)
363         gfxFont->decRefCnt();
364 #ifdef TEXTOUT_WORD_LIST
365     if (fontName) {
366         delete fontName;
367     }
368 #endif
369 }
370 
matches(const GfxState * state) const371 bool TextFontInfo::matches(const GfxState *state) const
372 {
373     return state->getFont() == gfxFont;
374 }
375 
matches(const TextFontInfo * fontInfo) const376 bool TextFontInfo::matches(const TextFontInfo *fontInfo) const
377 {
378     return gfxFont == fontInfo->gfxFont;
379 }
380 
matches(const Ref * ref) const381 bool TextFontInfo::matches(const Ref *ref) const
382 {
383     return (*(gfxFont->getID()) == *ref);
384 }
385 
getAscent() const386 double TextFontInfo::getAscent() const
387 {
388     return gfxFont ? gfxFont->getAscent() : 0.95;
389 }
390 
getDescent() const391 double TextFontInfo::getDescent() const
392 {
393     return gfxFont ? gfxFont->getDescent() : -0.35;
394 }
395 
getWMode() const396 int TextFontInfo::getWMode() const
397 {
398     return gfxFont ? gfxFont->getWMode() : 0;
399 }
400 
401 //------------------------------------------------------------------------
402 // TextWord
403 //------------------------------------------------------------------------
404 
TextWord(const GfxState * state,int rotA,double fontSizeA)405 TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
406 {
407     rot = rotA;
408     fontSize = fontSizeA;
409     text = nullptr;
410     charcode = nullptr;
411     edge = nullptr;
412     charPos = nullptr;
413     font = nullptr;
414     textMat = nullptr;
415     len = size = 0;
416     spaceAfter = false;
417     next = nullptr;
418     invisible = state->getRender() == 3;
419 
420 #ifdef TEXTOUT_WORD_LIST
421     GfxRGB rgb;
422 
423     if ((state->getRender() & 3) == 1) {
424         state->getStrokeRGB(&rgb);
425     } else {
426         state->getFillRGB(&rgb);
427     }
428     colorR = colToDbl(rgb.r);
429     colorG = colToDbl(rgb.g);
430     colorB = colToDbl(rgb.b);
431 #endif
432 
433     underlined = false;
434     link = nullptr;
435 }
436 
~TextWord()437 TextWord::~TextWord()
438 {
439     gfree(text);
440     gfree(charcode);
441     gfree(edge);
442     gfree(charPos);
443     gfree(font);
444     gfree(textMat);
445 }
446 
addChar(const GfxState * state,TextFontInfo * fontA,double x,double y,double dx,double dy,int charPosA,int charLen,CharCode c,Unicode u,const Matrix & textMatA)447 void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
448 {
449     ensureCapacity(len + 1);
450     text[len] = u;
451     charcode[len] = c;
452     charPos[len] = charPosA;
453     charPos[len + 1] = charPosA + charLen;
454     font[len] = fontA;
455     textMat[len] = textMatA;
456 
457     if (len == 0)
458         setInitialBounds(fontA, x, y);
459 
460     if (wMode) { // vertical writing mode
461         // NB: the rotation value has been incremented by 1 (in
462         // TextPage::beginWord()) for vertical writing mode
463         switch (rot) {
464         case 0:
465             edge[len] = x - fontSize;
466             xMax = edge[len + 1] = x;
467             break;
468         case 1:
469             edge[len] = y - fontSize;
470             yMax = edge[len + 1] = y;
471             break;
472         case 2:
473             edge[len] = x + fontSize;
474             xMin = edge[len + 1] = x;
475             break;
476         case 3:
477             edge[len] = y + fontSize;
478             yMin = edge[len + 1] = y;
479             break;
480         }
481     } else { // horizontal writing mode
482         switch (rot) {
483         case 0:
484             edge[len] = x;
485             xMax = edge[len + 1] = x + dx;
486             break;
487         case 1:
488             edge[len] = y;
489             yMax = edge[len + 1] = y + dy;
490             break;
491         case 2:
492             edge[len] = x;
493             xMin = edge[len + 1] = x + dx;
494             break;
495         case 3:
496             edge[len] = y;
497             yMin = edge[len + 1] = y + dy;
498             break;
499         }
500     }
501     ++len;
502 }
503 
setInitialBounds(TextFontInfo * fontA,double x,double y)504 void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
505 {
506     double ascent = fontA->getAscent() * fontSize;
507     double descent = fontA->getDescent() * fontSize;
508     wMode = fontA->getWMode();
509 
510     if (wMode) { // vertical writing mode
511         // NB: the rotation value has been incremented by 1 (in
512         // TextPage::beginWord()) for vertical writing mode
513         switch (rot) {
514         case 0:
515             xMin = x - fontSize;
516             yMin = y - fontSize;
517             yMax = y;
518             base = y;
519             break;
520         case 1:
521             xMin = x;
522             yMin = y - fontSize;
523             xMax = x + fontSize;
524             base = x;
525             break;
526         case 2:
527             yMin = y;
528             xMax = x + fontSize;
529             yMax = y + fontSize;
530             base = y;
531             break;
532         case 3:
533             xMin = x - fontSize;
534             xMax = x;
535             yMax = y + fontSize;
536             base = x;
537             break;
538         }
539     } else { // horizontal writing mode
540         switch (rot) {
541         case 0:
542             xMin = x;
543             yMin = y - ascent;
544             yMax = y - descent;
545             if (yMin == yMax) {
546                 // this is a sanity check for a case that shouldn't happen -- but
547                 // if it does happen, we want to avoid dividing by zero later
548                 yMin = y;
549                 yMax = y + 1;
550             }
551             base = y;
552             break;
553         case 1:
554             xMin = x + descent;
555             yMin = y;
556             xMax = x + ascent;
557             if (xMin == xMax) {
558                 // this is a sanity check for a case that shouldn't happen -- but
559                 // if it does happen, we want to avoid dividing by zero later
560                 xMin = x;
561                 xMax = x + 1;
562             }
563             base = x;
564             break;
565         case 2:
566             yMin = y + descent;
567             xMax = x;
568             yMax = y + ascent;
569             if (yMin == yMax) {
570                 // this is a sanity check for a case that shouldn't happen -- but
571                 // if it does happen, we want to avoid dividing by zero later
572                 yMin = y;
573                 yMax = y + 1;
574             }
575             base = y;
576             break;
577         case 3:
578             xMin = x - ascent;
579             xMax = x - descent;
580             yMax = y;
581             if (xMin == xMax) {
582                 // this is a sanity check for a case that shouldn't happen -- but
583                 // if it does happen, we want to avoid dividing by zero later
584                 xMin = x;
585                 xMax = x + 1;
586             }
587             base = x;
588             break;
589         }
590     }
591 }
592 
ensureCapacity(int capacity)593 void TextWord::ensureCapacity(int capacity)
594 {
595     if (capacity > size) {
596         size = std::max(size + 16, capacity);
597         text = (Unicode *)greallocn(text, size, sizeof(Unicode));
598         charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
599         edge = (double *)greallocn(edge, (size + 1), sizeof(double));
600         charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
601         font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
602         textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
603     }
604 }
605 
606 struct CombiningTable
607 {
608     Unicode base;
609     Unicode comb;
610 };
611 
612 static const struct CombiningTable combiningTable[] = {
613     { 0x0060, 0x0300 }, // grave
614     { 0x00a8, 0x0308 }, // dieresis
615     { 0x00af, 0x0304 }, // macron
616     { 0x00b4, 0x0301 }, // acute
617     { 0x00b8, 0x0327 }, // cedilla
618     { 0x02c6, 0x0302 }, // circumflex
619     { 0x02c7, 0x030c }, // caron
620     { 0x02d8, 0x0306 }, // breve
621     { 0x02d9, 0x0307 }, // dotaccent
622     { 0x02da, 0x030a }, // ring
623     { 0x02dc, 0x0303 }, // tilde
624     { 0x02dd, 0x030b } // hungarumlaut (double acute accent)
625 };
626 
627 // returning combining versions of characters
getCombiningChar(Unicode u)628 static Unicode getCombiningChar(Unicode u)
629 {
630     for (const CombiningTable &combining : combiningTable) {
631         if (u == combining.base)
632             return combining.comb;
633     }
634     return 0;
635 }
636 
addCombining(const GfxState * state,TextFontInfo * fontA,double fontSizeA,double x,double y,double dx,double dy,int charPosA,int charLen,CharCode c,Unicode u,const Matrix & textMatA)637 bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
638 {
639     if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
640         return false;
641 
642     Unicode cCurrent = getCombiningChar(u);
643     Unicode cPrev = getCombiningChar(text[len - 1]);
644     double edgeMid = (edge[len - 1] + edge[len]) / 2;
645     double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
646 
647     if (cCurrent != 0 && unicodeTypeAlphaNum(text[len - 1])) {
648         // Current is a combining character, previous is base character
649         maxScaledMidDelta = fabs(edge[len] - edge[len - 1]) * combMaxMidDelta;
650         charMid = charBase = maxScaledBaseDelta = 0;
651 
652         // Test if characters overlap
653         if (rot == 0 || rot == 2) {
654             charMid = x + (dx / 2);
655             charBase = y;
656             maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
657         } else {
658             charMid = y + (dy / 2);
659             charBase = x;
660             maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
661         }
662 
663         if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
664             return false;
665 
666         // Add character, but don't adjust edge / bounding box because
667         // combining character's positioning could be odd.
668         ensureCapacity(len + 1);
669         text[len] = cCurrent;
670         charcode[len] = c;
671         charPos[len] = charPosA;
672         charPos[len + 1] = charPosA + charLen;
673         font[len] = fontA;
674         textMat[len] = textMatA;
675         edge[len + 1] = edge[len];
676         edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
677         ++len;
678         return true;
679     }
680 
681     if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
682         // Previous is a combining character, current is base character
683         maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
684         charMid = charBase = maxScaledMidDelta = 0;
685 
686         // Test if characters overlap
687         if (rot == 0 || rot == 2) {
688             charMid = x + (dx / 2);
689             charBase = y;
690             maxScaledMidDelta = fabs(dx * combMaxMidDelta);
691         } else {
692             charMid = y + (dy / 2);
693             charBase = x;
694             maxScaledMidDelta = fabs(dy * combMaxMidDelta);
695         }
696 
697         if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
698             return false;
699 
700         // move combining character to after base character
701         ensureCapacity(len + 1);
702         fontSize = fontSizeA;
703         text[len] = cPrev;
704         charcode[len] = charcode[len - 1];
705         charPos[len] = charPosA;
706         charPos[len + 1] = charPosA + charLen;
707         font[len] = font[len - 1];
708         textMat[len] = textMat[len - 1];
709 
710         text[len - 1] = u;
711         charcode[len - 1] = c;
712         font[len - 1] = fontA;
713         textMat[len - 1] = textMatA;
714 
715         if (len == 1)
716             setInitialBounds(fontA, x, y);
717 
718         // Updated edges / bounding box because we changed the base
719         // character.
720         if (wMode) {
721             switch (rot) {
722             case 0:
723                 edge[len - 1] = x - fontSize;
724                 xMax = edge[len + 1] = x;
725                 break;
726             case 1:
727                 edge[len - 1] = y - fontSize;
728                 yMax = edge[len + 1] = y;
729                 break;
730             case 2:
731                 edge[len - 1] = x + fontSize;
732                 xMin = edge[len + 1] = x;
733                 break;
734             case 3:
735                 edge[len - 1] = y + fontSize;
736                 yMin = edge[len + 1] = y;
737                 break;
738             }
739         } else {
740             switch (rot) {
741             case 0:
742                 edge[len - 1] = x;
743                 xMax = edge[len + 1] = x + dx;
744                 break;
745             case 1:
746                 edge[len - 1] = y;
747                 yMax = edge[len + 1] = y + dy;
748                 break;
749             case 2:
750                 edge[len - 1] = x;
751                 xMin = edge[len + 1] = x + dx;
752                 break;
753             case 3:
754                 edge[len - 1] = y;
755                 yMin = edge[len + 1] = y + dy;
756                 break;
757             }
758         }
759 
760         edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
761         ++len;
762         return true;
763     }
764     return false;
765 }
766 
merge(TextWord * word)767 void TextWord::merge(TextWord *word)
768 {
769     int i;
770 
771     if (word->xMin < xMin) {
772         xMin = word->xMin;
773     }
774     if (word->yMin < yMin) {
775         yMin = word->yMin;
776     }
777     if (word->xMax > xMax) {
778         xMax = word->xMax;
779     }
780     if (word->yMax > yMax) {
781         yMax = word->yMax;
782     }
783     ensureCapacity(len + word->len);
784     for (i = 0; i < word->len; ++i) {
785         text[len + i] = word->text[i];
786         charcode[len + i] = word->charcode[i];
787         edge[len + i] = word->edge[i];
788         charPos[len + i] = word->charPos[i];
789         font[len + i] = word->font[i];
790         textMat[len + i] = word->textMat[i];
791     }
792     edge[len + word->len] = word->edge[word->len];
793     charPos[len + word->len] = word->charPos[word->len];
794     len += word->len;
795 }
796 
primaryCmp(const TextWord * word) const797 inline int TextWord::primaryCmp(const TextWord *word) const
798 {
799     double cmp;
800 
801     cmp = 0; // make gcc happy
802     switch (rot) {
803     case 0:
804         cmp = xMin - word->xMin;
805         break;
806     case 1:
807         cmp = yMin - word->yMin;
808         break;
809     case 2:
810         cmp = word->xMax - xMax;
811         break;
812     case 3:
813         cmp = word->yMax - yMax;
814         break;
815     }
816     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
817 }
818 
primaryDelta(const TextWord * word) const819 double TextWord::primaryDelta(const TextWord *word) const
820 {
821     double delta;
822 
823     delta = 0; // make gcc happy
824     switch (rot) {
825     case 0:
826         delta = word->xMin - xMax;
827         break;
828     case 1:
829         delta = word->yMin - yMax;
830         break;
831     case 2:
832         delta = xMin - word->xMax;
833         break;
834     case 3:
835         delta = yMin - word->yMax;
836         break;
837     }
838     return delta;
839 }
840 
cmpYX(const void * p1,const void * p2)841 int TextWord::cmpYX(const void *p1, const void *p2)
842 {
843     TextWord *word1 = *(TextWord **)p1;
844     TextWord *word2 = *(TextWord **)p2;
845     double cmp;
846 
847     cmp = word1->yMin - word2->yMin;
848     if (cmp == 0) {
849         cmp = word1->xMin - word2->xMin;
850     }
851     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
852 }
853 
854 #ifdef TEXTOUT_WORD_LIST
855 
getText() const856 GooString *TextWord::getText() const
857 {
858     GooString *s;
859     const UnicodeMap *uMap;
860     char buf[8];
861     int n, i;
862 
863     s = new GooString();
864     if (!(uMap = globalParams->getTextEncoding())) {
865         return s;
866     }
867     for (i = 0; i < len; ++i) {
868         n = uMap->mapUnicode(text[i], buf, sizeof(buf));
869         s->append(buf, n);
870     }
871     return s;
872 }
873 
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA) const874 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
875 {
876     if (charIdx < 0 || charIdx >= len) {
877         return;
878     }
879     switch (rot) {
880     case 0:
881         *xMinA = edge[charIdx];
882         *xMaxA = edge[charIdx + 1];
883         *yMinA = yMin;
884         *yMaxA = yMax;
885         break;
886     case 1:
887         *xMinA = xMin;
888         *xMaxA = xMax;
889         *yMinA = edge[charIdx];
890         *yMaxA = edge[charIdx + 1];
891         break;
892     case 2:
893         *xMinA = edge[charIdx + 1];
894         *xMaxA = edge[charIdx];
895         *yMinA = yMin;
896         *yMaxA = yMax;
897         break;
898     case 3:
899         *xMinA = xMin;
900         *xMaxA = xMax;
901         *yMinA = edge[charIdx + 1];
902         *yMaxA = edge[charIdx];
903         break;
904     }
905 }
906 
907 #endif // TEXTOUT_WORD_LIST
908 
909 //------------------------------------------------------------------------
910 // TextPool
911 //------------------------------------------------------------------------
912 
TextPool()913 TextPool::TextPool()
914 {
915     minBaseIdx = 0;
916     maxBaseIdx = -1;
917     pool = nullptr;
918     cursor = nullptr;
919     cursorBaseIdx = -1;
920 }
921 
~TextPool()922 TextPool::~TextPool()
923 {
924     int baseIdx;
925     TextWord *word, *word2;
926 
927     for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
928         for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
929             word2 = word->next;
930             delete word;
931         }
932     }
933     gfree(pool);
934 }
935 
getBaseIdx(double base) const936 int TextPool::getBaseIdx(double base) const
937 {
938     const double baseIdxDouble = base / textPoolStep;
939     if (std::isnan(baseIdxDouble) || baseIdxDouble < minBaseIdx) {
940         return minBaseIdx;
941     }
942     if (baseIdxDouble > maxBaseIdx) {
943         return maxBaseIdx;
944     }
945     return (int)baseIdxDouble;
946 }
947 
addWord(TextWord * word)948 void TextPool::addWord(TextWord *word)
949 {
950     int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
951     TextWord *w0, *w1;
952 
953     // expand the array if needed
954     wordBaseIdx = (int)(word->base / textPoolStep);
955     if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
956         error(errSyntaxWarning, -1, "wordBaseIdx out of range");
957         delete word;
958         return;
959     }
960     if (minBaseIdx > maxBaseIdx) {
961         minBaseIdx = wordBaseIdx - 128;
962         maxBaseIdx = wordBaseIdx + 128;
963         pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1, sizeof(TextWord *));
964         for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
965             pool[baseIdx - minBaseIdx] = nullptr;
966         }
967     } else if (wordBaseIdx < minBaseIdx) {
968         newMinBaseIdx = wordBaseIdx - 128;
969         TextWord **newPool = (TextWord **)gmallocn_checkoverflow(maxBaseIdx - newMinBaseIdx + 1, sizeof(TextWord *));
970         if (unlikely(!newPool)) {
971             error(errSyntaxWarning, -1, "newPool would overflow");
972             delete word;
973             return;
974         }
975         for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
976             newPool[baseIdx - newMinBaseIdx] = nullptr;
977         }
978         memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
979         gfree(pool);
980         pool = newPool;
981         minBaseIdx = newMinBaseIdx;
982     } else if (wordBaseIdx > maxBaseIdx) {
983         newMaxBaseIdx = wordBaseIdx + 128;
984         TextWord **reallocatedPool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1, sizeof(TextWord *), true /*checkoverflow*/, false /*free_pool*/);
985         if (!reallocatedPool) {
986             error(errSyntaxWarning, -1, "new pool size would overflow");
987             delete word;
988             return;
989         }
990         pool = reallocatedPool;
991         for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
992             pool[baseIdx - minBaseIdx] = nullptr;
993         }
994         maxBaseIdx = newMaxBaseIdx;
995     }
996 
997     // insert the new word
998     if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(cursor) >= 0) {
999         w0 = cursor;
1000         w1 = cursor->next;
1001     } else {
1002         w0 = nullptr;
1003         w1 = pool[wordBaseIdx - minBaseIdx];
1004     }
1005     for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next)
1006         ;
1007     word->next = w1;
1008     if (w0) {
1009         w0->next = word;
1010     } else {
1011         pool[wordBaseIdx - minBaseIdx] = word;
1012     }
1013     cursor = word;
1014     cursorBaseIdx = wordBaseIdx;
1015 }
1016 
1017 //------------------------------------------------------------------------
1018 // TextLine
1019 //------------------------------------------------------------------------
1020 
TextLine(TextBlock * blkA,int rotA,double baseA)1021 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA)
1022 {
1023     blk = blkA;
1024     rot = rotA;
1025     base = baseA;
1026     words = lastWord = nullptr;
1027     text = nullptr;
1028     edge = nullptr;
1029     col = nullptr;
1030     len = 0;
1031     convertedLen = 0;
1032     hyphenated = false;
1033     next = nullptr;
1034     xMin = yMin = 0;
1035     xMax = yMax = -1;
1036     normalized = nullptr;
1037     normalized_len = 0;
1038     normalized_idx = nullptr;
1039     ascii_translation = nullptr;
1040     ascii_len = 0;
1041     ascii_idx = nullptr;
1042 }
1043 
~TextLine()1044 TextLine::~TextLine()
1045 {
1046     TextWord *word;
1047 
1048     while (words) {
1049         word = words;
1050         words = words->next;
1051         delete word;
1052     }
1053     gfree(text);
1054     gfree(edge);
1055     gfree(col);
1056     if (normalized) {
1057         gfree(normalized);
1058         gfree(normalized_idx);
1059     }
1060     if (ascii_translation) {
1061         gfree(ascii_translation);
1062         gfree(ascii_idx);
1063     }
1064 }
1065 
addWord(TextWord * word)1066 void TextLine::addWord(TextWord *word)
1067 {
1068     if (lastWord) {
1069         lastWord->next = word;
1070     } else {
1071         words = word;
1072     }
1073     lastWord = word;
1074 
1075     if (xMin > xMax) {
1076         xMin = word->xMin;
1077         xMax = word->xMax;
1078         yMin = word->yMin;
1079         yMax = word->yMax;
1080     } else {
1081         if (word->xMin < xMin) {
1082             xMin = word->xMin;
1083         }
1084         if (word->xMax > xMax) {
1085             xMax = word->xMax;
1086         }
1087         if (word->yMin < yMin) {
1088             yMin = word->yMin;
1089         }
1090         if (word->yMax > yMax) {
1091             yMax = word->yMax;
1092         }
1093     }
1094 }
1095 
primaryDelta(const TextLine * line) const1096 double TextLine::primaryDelta(const TextLine *line) const
1097 {
1098     double delta;
1099 
1100     delta = 0; // make gcc happy
1101     switch (rot) {
1102     case 0:
1103         delta = line->xMin - xMax;
1104         break;
1105     case 1:
1106         delta = line->yMin - yMax;
1107         break;
1108     case 2:
1109         delta = xMin - line->xMax;
1110         break;
1111     case 3:
1112         delta = yMin - line->yMax;
1113         break;
1114     }
1115     return delta;
1116 }
1117 
primaryCmp(const TextLine * line) const1118 int TextLine::primaryCmp(const TextLine *line) const
1119 {
1120     double cmp;
1121 
1122     cmp = 0; // make gcc happy
1123     switch (rot) {
1124     case 0:
1125         cmp = xMin - line->xMin;
1126         break;
1127     case 1:
1128         cmp = yMin - line->yMin;
1129         break;
1130     case 2:
1131         cmp = line->xMax - xMax;
1132         break;
1133     case 3:
1134         cmp = line->yMax - yMax;
1135         break;
1136     }
1137     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1138 }
1139 
secondaryCmp(const TextLine * line) const1140 int TextLine::secondaryCmp(const TextLine *line) const
1141 {
1142     double cmp;
1143 
1144     cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1145     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1146 }
1147 
cmpYX(const TextLine * line) const1148 int TextLine::cmpYX(const TextLine *line) const
1149 {
1150     int cmp;
1151 
1152     if ((cmp = secondaryCmp(line))) {
1153         return cmp;
1154     }
1155     return primaryCmp(line);
1156 }
1157 
cmpXY(const void * p1,const void * p2)1158 int TextLine::cmpXY(const void *p1, const void *p2)
1159 {
1160     TextLine *line1 = *(TextLine **)p1;
1161     TextLine *line2 = *(TextLine **)p2;
1162     int cmp;
1163 
1164     if ((cmp = line1->primaryCmp(line2))) {
1165         return cmp;
1166     }
1167     return line1->secondaryCmp(line2);
1168 }
1169 
coalesce(const UnicodeMap * uMap)1170 void TextLine::coalesce(const UnicodeMap *uMap)
1171 {
1172     TextWord *word0, *word1;
1173     double space, delta, minSpace;
1174     bool isUnicode;
1175     char buf[8];
1176     int i, j;
1177 
1178     if (words->next) {
1179 
1180         // compute the inter-word space threshold
1181         if (words->len > 1 || words->next->len > 1) {
1182             minSpace = 0;
1183         } else {
1184             minSpace = words->primaryDelta(words->next);
1185             for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
1186                 if (word1->len > 1) {
1187                     minSpace = 0;
1188                 }
1189                 delta = word0->primaryDelta(word1);
1190                 if (delta < minSpace) {
1191                     minSpace = delta;
1192                 }
1193             }
1194         }
1195         if (minSpace <= 0) {
1196             space = maxCharSpacing * words->fontSize;
1197         } else {
1198             space = maxWideCharSpacingMul * minSpace;
1199             if (space > maxWideCharSpacing * words->fontSize) {
1200                 space = maxWideCharSpacing * words->fontSize;
1201             }
1202         }
1203 
1204         // merge words
1205         word0 = words;
1206         word1 = words->next;
1207         while (word1) {
1208             if (word0->primaryDelta(word1) >= space) {
1209                 word0->spaceAfter = true;
1210                 word0 = word1;
1211                 word1 = word1->next;
1212             } else if (word0->font[word0->len - 1] == word1->font[0] && word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize
1213                        && word1->charPos[0] == word0->charPos[word0->len]) {
1214                 word0->merge(word1);
1215                 word0->next = word1->next;
1216                 delete word1;
1217                 word1 = word0->next;
1218             } else {
1219                 word0 = word1;
1220                 word1 = word1->next;
1221             }
1222         }
1223     }
1224 
1225     // build the line text
1226     isUnicode = uMap ? uMap->isUnicode() : false;
1227     len = 0;
1228     for (word1 = words; word1; word1 = word1->next) {
1229         len += word1->len;
1230         if (word1->spaceAfter) {
1231             ++len;
1232         }
1233     }
1234     text = (Unicode *)gmallocn(len, sizeof(Unicode));
1235     edge = (double *)gmallocn(len + 1, sizeof(double));
1236     i = 0;
1237     for (word1 = words; word1; word1 = word1->next) {
1238         for (j = 0; j < word1->len; ++j) {
1239             text[i] = word1->text[j];
1240             edge[i] = word1->edge[j];
1241             ++i;
1242         }
1243         edge[i] = word1->edge[word1->len];
1244         if (word1->spaceAfter) {
1245             text[i] = (Unicode)0x0020;
1246             ++i;
1247         }
1248     }
1249 
1250     // compute convertedLen and set up the col array
1251     col = (int *)gmallocn(len + 1, sizeof(int));
1252     convertedLen = 0;
1253     for (i = 0; i < len; ++i) {
1254         col[i] = convertedLen;
1255         if (isUnicode) {
1256             ++convertedLen;
1257         } else if (uMap) {
1258             convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
1259         }
1260     }
1261     col[len] = convertedLen;
1262 
1263     // check for hyphen at end of line
1264     //~ need to check for other chars used as hyphens
1265     hyphenated = text[len - 1] == (Unicode)'-';
1266 }
1267 
1268 //------------------------------------------------------------------------
1269 // TextLineFrag
1270 //------------------------------------------------------------------------
1271 
1272 class TextLineFrag
1273 {
1274 public:
1275     TextLine *line; // the line object
1276     int start, len; // offset and length of this fragment
1277                     //   (in Unicode chars)
1278     double xMin, xMax; // bounding box coordinates
1279     double yMin, yMax;
1280     double base; // baseline virtual coordinate
1281     int col; // first column
1282 
1283     void init(TextLine *lineA, int startA, int lenA);
1284     void computeCoords(bool oneRot);
1285 
1286     static int cmpYXPrimaryRot(const void *p1, const void *p2);
1287     static int cmpYXLineRot(const void *p1, const void *p2);
1288     static int cmpXYLineRot(const void *p1, const void *p2);
1289     static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1290     static int cmpXYColumnLineRot(const void *p1, const void *p2);
1291 };
1292 
init(TextLine * lineA,int startA,int lenA)1293 void TextLineFrag::init(TextLine *lineA, int startA, int lenA)
1294 {
1295     line = lineA;
1296     start = startA;
1297     len = lenA;
1298     col = line->col[start];
1299 }
1300 
computeCoords(bool oneRot)1301 void TextLineFrag::computeCoords(bool oneRot)
1302 {
1303     TextBlock *blk;
1304     double d0, d1, d2, d3, d4;
1305 
1306     if (oneRot) {
1307 
1308         switch (line->rot) {
1309         case 0:
1310             xMin = line->edge[start];
1311             xMax = line->edge[start + len];
1312             yMin = line->yMin;
1313             yMax = line->yMax;
1314             break;
1315         case 1:
1316             xMin = line->xMin;
1317             xMax = line->xMax;
1318             yMin = line->edge[start];
1319             yMax = line->edge[start + len];
1320             break;
1321         case 2:
1322             xMin = line->edge[start + len];
1323             xMax = line->edge[start];
1324             yMin = line->yMin;
1325             yMax = line->yMax;
1326             break;
1327         case 3:
1328             xMin = line->xMin;
1329             xMax = line->xMax;
1330             yMin = line->edge[start + len];
1331             yMax = line->edge[start];
1332             break;
1333         }
1334         base = line->base;
1335 
1336     } else {
1337 
1338         if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1339 
1340             xMin = line->edge[start];
1341             xMax = line->edge[start + len];
1342             yMin = line->yMin;
1343             yMax = line->yMax;
1344             base = line->base;
1345 
1346         } else {
1347 
1348             blk = line->blk;
1349             d0 = line->edge[start];
1350             d1 = line->edge[start + len];
1351             d2 = d3 = d4 = 0; // make gcc happy
1352 
1353             switch (line->rot) {
1354             case 0:
1355                 d2 = line->yMin;
1356                 d3 = line->yMax;
1357                 d4 = line->base;
1358                 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1359                 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1360                 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1361                 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1362                 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1363                 break;
1364             case 1:
1365                 d2 = line->xMax;
1366                 d3 = line->xMin;
1367                 d4 = line->base;
1368                 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1369                 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1370                 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1371                 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1372                 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1373                 break;
1374             case 2:
1375                 d2 = line->yMax;
1376                 d3 = line->yMin;
1377                 d4 = line->base;
1378                 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1379                 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1380                 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1381                 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1382                 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1383                 break;
1384             case 3:
1385                 d2 = line->xMin;
1386                 d3 = line->xMax;
1387                 d4 = line->base;
1388                 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1389                 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1390                 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1391                 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1392                 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1393                 break;
1394             }
1395 
1396             switch (line->blk->page->primaryRot) {
1397             case 0:
1398                 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1399                 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1400                 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1401                 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1402                 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1403                 break;
1404             case 1:
1405                 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1406                 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1407                 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1408                 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1409                 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1410                 break;
1411             case 2:
1412                 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1413                 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1414                 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1415                 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1416                 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1417                 break;
1418             case 3:
1419                 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1420                 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1421                 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1422                 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1423                 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1424                 break;
1425             }
1426         }
1427     }
1428 }
1429 
cmpYXPrimaryRot(const void * p1,const void * p2)1430 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2)
1431 {
1432     TextLineFrag *frag1 = (TextLineFrag *)p1;
1433     TextLineFrag *frag2 = (TextLineFrag *)p2;
1434     double cmp;
1435 
1436     cmp = 0; // make gcc happy
1437     switch (frag1->line->blk->page->primaryRot) {
1438     case 0:
1439         if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1440             cmp = frag1->xMin - frag2->xMin;
1441         }
1442         break;
1443     case 1:
1444         if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1445             cmp = frag1->yMin - frag2->yMin;
1446         }
1447         break;
1448     case 2:
1449         if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1450             cmp = frag2->xMax - frag1->xMax;
1451         }
1452         break;
1453     case 3:
1454         if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1455             cmp = frag2->yMax - frag1->yMax;
1456         }
1457         break;
1458     }
1459     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1460 }
1461 
cmpYXLineRot(const void * p1,const void * p2)1462 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2)
1463 {
1464     TextLineFrag *frag1 = (TextLineFrag *)p1;
1465     TextLineFrag *frag2 = (TextLineFrag *)p2;
1466     double cmp;
1467 
1468     cmp = 0; // make gcc happy
1469     switch (frag1->line->rot) {
1470     case 0:
1471         if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1472             cmp = frag1->xMin - frag2->xMin;
1473         }
1474         break;
1475     case 1:
1476         if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1477             cmp = frag1->yMin - frag2->yMin;
1478         }
1479         break;
1480     case 2:
1481         if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1482             cmp = frag2->xMax - frag1->xMax;
1483         }
1484         break;
1485     case 3:
1486         if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1487             cmp = frag2->yMax - frag1->yMax;
1488         }
1489         break;
1490     }
1491     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1492 }
1493 
cmpXYLineRot(const void * p1,const void * p2)1494 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2)
1495 {
1496     TextLineFrag *frag1 = (TextLineFrag *)p1;
1497     TextLineFrag *frag2 = (TextLineFrag *)p2;
1498     double cmp;
1499 
1500     cmp = 0; // make gcc happy
1501     switch (frag1->line->rot) {
1502     case 0:
1503         if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1504             cmp = frag1->yMin - frag2->yMin;
1505         }
1506         break;
1507     case 1:
1508         if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1509             cmp = frag2->xMax - frag1->xMax;
1510         }
1511         break;
1512     case 2:
1513         if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1514             cmp = frag2->yMin - frag1->yMin;
1515         }
1516         break;
1517     case 3:
1518         if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1519             cmp = frag1->xMax - frag2->xMax;
1520         }
1521         break;
1522     }
1523     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1524 }
1525 
cmpXYColumnPrimaryRot(const void * p1,const void * p2)1526 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2)
1527 {
1528     TextLineFrag *frag1 = (TextLineFrag *)p1;
1529     TextLineFrag *frag2 = (TextLineFrag *)p2;
1530     double cmp;
1531 
1532     // if columns overlap, compare y values
1533     if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1534         cmp = 0; // make gcc happy
1535         switch (frag1->line->blk->page->primaryRot) {
1536         case 0:
1537             cmp = frag1->yMin - frag2->yMin;
1538             break;
1539         case 1:
1540             cmp = frag2->xMax - frag1->xMax;
1541             break;
1542         case 2:
1543             cmp = frag2->yMin - frag1->yMin;
1544             break;
1545         case 3:
1546             cmp = frag1->xMax - frag2->xMax;
1547             break;
1548         }
1549         return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1550     }
1551 
1552     // otherwise, compare starting column
1553     return frag1->col - frag2->col;
1554 }
1555 
cmpXYColumnLineRot(const void * p1,const void * p2)1556 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2)
1557 {
1558     TextLineFrag *frag1 = (TextLineFrag *)p1;
1559     TextLineFrag *frag2 = (TextLineFrag *)p2;
1560     double cmp;
1561 
1562     // if columns overlap, compare y values
1563     if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1564         cmp = 0; // make gcc happy
1565         switch (frag1->line->rot) {
1566         case 0:
1567             cmp = frag1->yMin - frag2->yMin;
1568             break;
1569         case 1:
1570             cmp = frag2->xMax - frag1->xMax;
1571             break;
1572         case 2:
1573             cmp = frag2->yMin - frag1->yMin;
1574             break;
1575         case 3:
1576             cmp = frag1->xMax - frag2->xMax;
1577             break;
1578         }
1579         return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1580     }
1581 
1582     // otherwise, compare starting column
1583     return frag1->col - frag2->col;
1584 }
1585 
1586 //------------------------------------------------------------------------
1587 // TextBlock
1588 //------------------------------------------------------------------------
1589 
TextBlock(TextPage * pageA,int rotA)1590 TextBlock::TextBlock(TextPage *pageA, int rotA)
1591 {
1592     page = pageA;
1593     rot = rotA;
1594     xMin = yMin = 0;
1595     xMax = yMax = -1;
1596     priMin = 0;
1597     priMax = page->pageWidth;
1598     pool = new TextPool();
1599     lines = nullptr;
1600     curLine = nullptr;
1601     next = nullptr;
1602     stackNext = nullptr;
1603     tableId = -1;
1604     tableEnd = false;
1605 }
1606 
~TextBlock()1607 TextBlock::~TextBlock()
1608 {
1609     TextLine *line;
1610 
1611     delete pool;
1612     while (lines) {
1613         line = lines;
1614         lines = lines->next;
1615         delete line;
1616     }
1617 }
1618 
addWord(TextWord * word)1619 void TextBlock::addWord(TextWord *word)
1620 {
1621     pool->addWord(word);
1622     if (xMin > xMax) {
1623         xMin = word->xMin;
1624         xMax = word->xMax;
1625         yMin = word->yMin;
1626         yMax = word->yMax;
1627     } else {
1628         if (word->xMin < xMin) {
1629             xMin = word->xMin;
1630         }
1631         if (word->xMax > xMax) {
1632             xMax = word->xMax;
1633         }
1634         if (word->yMin < yMin) {
1635             yMin = word->yMin;
1636         }
1637         if (word->yMax > yMax) {
1638             yMax = word->yMax;
1639         }
1640     }
1641 }
1642 
coalesce(const UnicodeMap * uMap,double fixedPitch)1643 void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
1644 {
1645     TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1646     TextLine *line, *line0, *line1;
1647     int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1648     int baseIdx, bestWordBaseIdx, idx0, idx1;
1649     double minBase, maxBase;
1650     double fontSize, wordSpacing, delta, priDelta, secDelta;
1651     TextLine **lineArray;
1652     bool found, overlap;
1653     int col1, col2;
1654     int i, j, k;
1655 
1656     // discard duplicated text (fake boldface, drop shadows)
1657     for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1658         word0 = pool->getPool(idx0);
1659         while (word0) {
1660             priDelta = dupMaxPriDelta * word0->fontSize;
1661             secDelta = dupMaxSecDelta * word0->fontSize;
1662             maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1663             found = false;
1664             word1 = word2 = nullptr; // make gcc happy
1665             for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1666                 if (idx1 == idx0) {
1667                     word1 = word0;
1668                     word2 = word0->next;
1669                 } else {
1670                     word1 = nullptr;
1671                     word2 = pool->getPool(idx1);
1672                 }
1673                 for (; word2; word1 = word2, word2 = word2->next) {
1674                     if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode))) {
1675                         switch (rot) {
1676                         case 0:
1677                         case 2:
1678                             found = fabs(word0->xMin - word2->xMin) < priDelta && fabs(word0->xMax - word2->xMax) < priDelta && fabs(word0->yMin - word2->yMin) < secDelta && fabs(word0->yMax - word2->yMax) < secDelta;
1679                             break;
1680                         case 1:
1681                         case 3:
1682                             found = fabs(word0->xMin - word2->xMin) < secDelta && fabs(word0->xMax - word2->xMax) < secDelta && fabs(word0->yMin - word2->yMin) < priDelta && fabs(word0->yMax - word2->yMax) < priDelta;
1683                             break;
1684                         }
1685                     }
1686                     if (found) {
1687                         break;
1688                     }
1689                 }
1690                 if (found) {
1691                     break;
1692                 }
1693             }
1694             if (found) {
1695                 if (word1) {
1696                     word1->next = word2->next;
1697                 } else {
1698                     pool->setPool(idx1, word2->next);
1699                 }
1700                 delete word2;
1701             } else {
1702                 word0 = word0->next;
1703             }
1704         }
1705     }
1706 
1707     // build the lines
1708     curLine = nullptr;
1709     poolMinBaseIdx = pool->minBaseIdx;
1710     charCount = 0;
1711     nLines = 0;
1712     while (true) {
1713 
1714         // find the first non-empty line in the pool
1715         for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
1716             ;
1717         if (poolMinBaseIdx > pool->maxBaseIdx) {
1718             break;
1719         }
1720 
1721         // look for the left-most word in the first four lines of the
1722         // pool -- this avoids starting with a superscript word
1723         startBaseIdx = poolMinBaseIdx;
1724         for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1725             if (!pool->getPool(baseIdx)) {
1726                 continue;
1727             }
1728             if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
1729                 startBaseIdx = baseIdx;
1730             }
1731         }
1732 
1733         // create a new line
1734         word0 = pool->getPool(startBaseIdx);
1735         pool->setPool(startBaseIdx, word0->next);
1736         word0->next = nullptr;
1737         line = new TextLine(this, word0->rot, word0->base);
1738         line->addWord(word0);
1739         lastWord = word0;
1740 
1741         // compute the search range
1742         fontSize = word0->fontSize;
1743         minBase = word0->base - maxIntraLineDelta * fontSize;
1744         maxBase = word0->base + maxIntraLineDelta * fontSize;
1745         minBaseIdx = pool->getBaseIdx(minBase);
1746         maxBaseIdx = pool->getBaseIdx(maxBase);
1747         wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1748 
1749         // find the rest of the words in this line
1750         while (true) {
1751 
1752             // find the left-most word whose baseline is in the range for
1753             // this line
1754             bestWordBaseIdx = 0;
1755             bestWord0 = bestWord1 = nullptr;
1756             overlap = false;
1757             for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1758                 for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1759                     if (word1->base >= minBase && word1->base <= maxBase) {
1760                         delta = lastWord->primaryDelta(word1);
1761                         if (delta < minCharSpacing * fontSize) {
1762                             overlap = true;
1763                             break;
1764                         } else {
1765                             if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1766                                 bestWordBaseIdx = baseIdx;
1767                                 bestWord0 = word0;
1768                                 bestWord1 = word1;
1769                             }
1770                             break;
1771                         }
1772                     }
1773                 }
1774             }
1775             if (overlap || !bestWord1) {
1776                 break;
1777             }
1778 
1779             // remove it from the pool, and add it to the line
1780             if (bestWord0) {
1781                 bestWord0->next = bestWord1->next;
1782             } else {
1783                 pool->setPool(bestWordBaseIdx, bestWord1->next);
1784             }
1785             bestWord1->next = nullptr;
1786             line->addWord(bestWord1);
1787             lastWord = bestWord1;
1788         }
1789 
1790         // add the line
1791         if (curLine && line->cmpYX(curLine) > 0) {
1792             line0 = curLine;
1793             line1 = curLine->next;
1794         } else {
1795             line0 = nullptr;
1796             line1 = lines;
1797         }
1798         for (; line1 && line->cmpYX(line1) > 0; line0 = line1, line1 = line1->next)
1799             ;
1800         if (line0) {
1801             line0->next = line;
1802         } else {
1803             lines = line;
1804         }
1805         line->next = line1;
1806         curLine = line;
1807         line->coalesce(uMap);
1808         charCount += line->len;
1809         ++nLines;
1810     }
1811 
1812     // sort lines into xy order for column assignment
1813     lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1814     for (line = lines, i = 0; line; line = line->next, ++i) {
1815         lineArray[i] = line;
1816     }
1817     qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1818 
1819     // column assignment
1820     nColumns = 0;
1821     if (fixedPitch) {
1822         for (i = 0; i < nLines; ++i) {
1823             line0 = lineArray[i];
1824             col1 = 0; // make gcc happy
1825             switch (rot) {
1826             case 0:
1827                 col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1828                 break;
1829             case 1:
1830                 col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1831                 break;
1832             case 2:
1833                 col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1834                 break;
1835             case 3:
1836                 col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1837                 break;
1838             }
1839             for (k = 0; k <= line0->len; ++k) {
1840                 line0->col[k] += col1;
1841             }
1842             if (line0->col[line0->len] > nColumns) {
1843                 nColumns = line0->col[line0->len];
1844             }
1845         }
1846     } else {
1847         for (i = 0; i < nLines; ++i) {
1848             line0 = lineArray[i];
1849             col1 = 0;
1850             for (j = 0; j < i; ++j) {
1851                 line1 = lineArray[j];
1852                 if (line1->primaryDelta(line0) >= 0) {
1853                     col2 = line1->col[line1->len] + 1;
1854                 } else {
1855                     k = 0; // make gcc happy
1856                     switch (rot) {
1857                     case 0:
1858                         for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1859                             ;
1860                         break;
1861                     case 1:
1862                         for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1863                             ;
1864                         break;
1865                     case 2:
1866                         for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1867                             ;
1868                         break;
1869                     case 3:
1870                         for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1871                             ;
1872                         break;
1873                     }
1874                     col2 = line1->col[k];
1875                 }
1876                 if (col2 > col1) {
1877                     col1 = col2;
1878                 }
1879             }
1880             for (k = 0; k <= line0->len; ++k) {
1881                 line0->col[k] += col1;
1882             }
1883             if (line0->col[line0->len] > nColumns) {
1884                 nColumns = line0->col[line0->len];
1885             }
1886         }
1887     }
1888     gfree(lineArray);
1889 }
1890 
updatePriMinMax(const TextBlock * blk)1891 void TextBlock::updatePriMinMax(const TextBlock *blk)
1892 {
1893     double newPriMin, newPriMax;
1894     bool gotPriMin, gotPriMax;
1895 
1896     gotPriMin = gotPriMax = false;
1897     newPriMin = newPriMax = 0; // make gcc happy
1898     switch (page->primaryRot) {
1899     case 0:
1900     case 2:
1901         if (blk->yMin < yMax && blk->yMax > yMin) {
1902             if (blk->xMin < xMin) {
1903                 newPriMin = blk->xMax;
1904                 gotPriMin = true;
1905             }
1906             if (blk->xMax > xMax) {
1907                 newPriMax = blk->xMin;
1908                 gotPriMax = true;
1909             }
1910         }
1911         break;
1912     case 1:
1913     case 3:
1914         if (blk->xMin < xMax && blk->xMax > xMin) {
1915             if (blk->yMin < yMin) {
1916                 newPriMin = blk->yMax;
1917                 gotPriMin = true;
1918             }
1919             if (blk->yMax > yMax) {
1920                 newPriMax = blk->yMin;
1921                 gotPriMax = true;
1922             }
1923         }
1924         break;
1925     }
1926     if (gotPriMin) {
1927         if (newPriMin > xMin) {
1928             newPriMin = xMin;
1929         }
1930         if (newPriMin > priMin) {
1931             priMin = newPriMin;
1932         }
1933     }
1934     if (gotPriMax) {
1935         if (newPriMax < xMax) {
1936             newPriMax = xMax;
1937         }
1938         if (newPriMax < priMax) {
1939             priMax = newPriMax;
1940         }
1941     }
1942 }
1943 
cmpXYPrimaryRot(const void * p1,const void * p2)1944 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2)
1945 {
1946     TextBlock *blk1 = *(TextBlock **)p1;
1947     TextBlock *blk2 = *(TextBlock **)p2;
1948     double cmp;
1949 
1950     cmp = 0; // make gcc happy
1951     switch (blk1->page->primaryRot) {
1952     case 0:
1953         if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1954             cmp = blk1->yMin - blk2->yMin;
1955         }
1956         break;
1957     case 1:
1958         if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1959             cmp = blk2->xMax - blk1->xMax;
1960         }
1961         break;
1962     case 2:
1963         if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1964             cmp = blk2->yMin - blk1->yMin;
1965         }
1966         break;
1967     case 3:
1968         if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1969             cmp = blk1->xMax - blk2->xMax;
1970         }
1971         break;
1972     }
1973     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1974 }
1975 
cmpYXPrimaryRot(const void * p1,const void * p2)1976 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2)
1977 {
1978     TextBlock *blk1 = *(TextBlock **)p1;
1979     TextBlock *blk2 = *(TextBlock **)p2;
1980     double cmp;
1981 
1982     cmp = 0; // make gcc happy
1983     switch (blk1->page->primaryRot) {
1984     case 0:
1985         if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1986             cmp = blk1->xMin - blk2->xMin;
1987         }
1988         break;
1989     case 1:
1990         if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1991             cmp = blk1->yMin - blk2->yMin;
1992         }
1993         break;
1994     case 2:
1995         if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1996             cmp = blk2->xMax - blk1->xMax;
1997         }
1998         break;
1999     case 3:
2000         if ((cmp = blk1->xMax - blk2->xMax) == 0) {
2001             cmp = blk2->yMax - blk1->yMax;
2002         }
2003         break;
2004     }
2005     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2006 }
2007 
primaryCmp(const TextBlock * blk) const2008 int TextBlock::primaryCmp(const TextBlock *blk) const
2009 {
2010     double cmp;
2011 
2012     cmp = 0; // make gcc happy
2013     switch (rot) {
2014     case 0:
2015         cmp = xMin - blk->xMin;
2016         break;
2017     case 1:
2018         cmp = yMin - blk->yMin;
2019         break;
2020     case 2:
2021         cmp = blk->xMax - xMax;
2022         break;
2023     case 3:
2024         cmp = blk->yMax - yMax;
2025         break;
2026     }
2027     return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2028 }
2029 
secondaryDelta(const TextBlock * blk) const2030 double TextBlock::secondaryDelta(const TextBlock *blk) const
2031 {
2032     double delta;
2033 
2034     delta = 0; // make gcc happy
2035     switch (rot) {
2036     case 0:
2037         delta = blk->yMin - yMax;
2038         break;
2039     case 1:
2040         delta = xMin - blk->xMax;
2041         break;
2042     case 2:
2043         delta = yMin - blk->yMax;
2044         break;
2045     case 3:
2046         delta = blk->xMin - xMax;
2047         break;
2048     }
2049     return delta;
2050 }
2051 
isBelow(const TextBlock * blk) const2052 bool TextBlock::isBelow(const TextBlock *blk) const
2053 {
2054     bool below;
2055 
2056     below = false; // make gcc happy
2057     switch (page->primaryRot) {
2058     case 0:
2059         below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2060         break;
2061     case 1:
2062         below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2063         break;
2064     case 2:
2065         below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2066         break;
2067     case 3:
2068         below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2069         break;
2070     }
2071 
2072     return below;
2073 }
2074 
isBeforeByRule1(const TextBlock * blk1)2075 bool TextBlock::isBeforeByRule1(const TextBlock *blk1)
2076 {
2077     bool before = false;
2078     bool overlap = false;
2079 
2080     switch (this->page->primaryRot) {
2081     case 0:
2082     case 2:
2083         overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2084         break;
2085     case 1:
2086     case 3:
2087         overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2088         break;
2089     }
2090     switch (this->page->primaryRot) {
2091     case 0:
2092         before = overlap && this->EyMin < blk1->EyMin;
2093         break;
2094     case 1:
2095         before = overlap && this->ExMax > blk1->ExMax;
2096         break;
2097     case 2:
2098         before = overlap && this->EyMax > blk1->EyMax;
2099         break;
2100     case 3:
2101         before = overlap && this->ExMin < blk1->ExMin;
2102         break;
2103     }
2104     return before;
2105 }
2106 
isBeforeByRule2(const TextBlock * blk1)2107 bool TextBlock::isBeforeByRule2(const TextBlock *blk1)
2108 {
2109     double cmp = 0;
2110     int rotLR = rot;
2111 
2112     if (!page->primaryLR) {
2113         rotLR = (rotLR + 2) % 4;
2114     }
2115 
2116     switch (rotLR) {
2117     case 0:
2118         cmp = ExMax - blk1->ExMin;
2119         break;
2120     case 1:
2121         cmp = EyMin - blk1->EyMax;
2122         break;
2123     case 2:
2124         cmp = blk1->ExMax - ExMin;
2125         break;
2126     case 3:
2127         cmp = blk1->EyMin - EyMax;
2128         break;
2129     }
2130     return cmp <= 0;
2131 }
2132 
2133 // Sort into reading order by performing a topological sort using the rules
2134 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2135 // See http://pubs.iupr.org/#2003-breuel-sdiut
2136 // Topological sort is done by depth first search, see
2137 // http://en.wikipedia.org/wiki/Topological_sorting
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,bool * visited,TextBlock ** cache,int cacheSize)2138 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize)
2139 {
2140     int pos2;
2141     TextBlock *blk1, *blk2, *blk3;
2142     bool before;
2143 
2144     if (visited[pos1]) {
2145         return sortPos;
2146     }
2147 
2148     blk1 = this;
2149 
2150 #if 0 // for debugging
2151   printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2152 	 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2153 #endif
2154     visited[pos1] = true;
2155     pos2 = -1;
2156     for (blk2 = blkList; blk2; blk2 = blk2->next) {
2157         pos2++;
2158         if (visited[pos2]) {
2159             // skip visited nodes
2160             continue;
2161         }
2162         before = false;
2163 
2164         // is blk2 before blk1? (for table entries)
2165         if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2166             if (page->primaryLR) {
2167                 if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2168                     before = true;
2169             } else {
2170                 if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2171                     before = true;
2172             }
2173 
2174             if (blk2->yMax <= blk1->yMin)
2175                 before = true;
2176         } else {
2177             if (blk2->isBeforeByRule1(blk1)) {
2178                 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2179                 before = true;
2180 #if 0 // for debugging
2181         printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2182 	       blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2183 	       blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2184 #endif
2185             } else if (blk2->isBeforeByRule2(blk1)) {
2186                 // Rule (2) blk2 left of blk1, and no intervening blk3
2187                 //          such that blk1 is before blk3 by rule 1,
2188                 //          and blk3 is before blk2 by rule 1.
2189                 before = true;
2190                 for (int i = 0; i < cacheSize && cache[i]; ++i) {
2191                     if (blk1->isBeforeByRule1(cache[i]) && cache[i]->isBeforeByRule1(blk2)) {
2192                         before = false;
2193                         std::rotate(cache, cache + i, cache + i + 1);
2194                         break;
2195                     }
2196                 }
2197 
2198                 if (before) {
2199                     for (blk3 = blkList; blk3; blk3 = blk3->next) {
2200                         if (blk3 == blk2 || blk3 == blk1) {
2201                             continue;
2202                         }
2203                         if (blk1->isBeforeByRule1(blk3) && blk3->isBeforeByRule1(blk2)) {
2204                             before = false;
2205                             std::copy_backward(cache, cache + cacheSize - 1, cache + cacheSize);
2206                             cache[0] = blk3;
2207                             break;
2208                         }
2209                     }
2210                 }
2211 #if 0 // for debugging
2212         if (before) {
2213 	  printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2214 	         blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2215 	         blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2216         }
2217 #endif
2218             }
2219         }
2220         if (before) {
2221             // blk2 is before blk1, so it needs to be visited
2222             // before we can add blk1 to the sorted list.
2223             sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited, cache, cacheSize);
2224         }
2225     }
2226 #if 0 // for debugging
2227   printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2228 	 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2229 #endif
2230     sorted[sortPos++] = blk1;
2231     return sortPos;
2232 }
2233 
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,bool * visited)2234 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited)
2235 {
2236     const int blockCacheSize = 4;
2237     TextBlock *blockCache[blockCacheSize];
2238     std::fill(blockCache, blockCache + blockCacheSize, nullptr);
2239     return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, blockCache, blockCacheSize);
2240 }
2241 
2242 //------------------------------------------------------------------------
2243 // TextFlow
2244 //------------------------------------------------------------------------
2245 
TextFlow(TextPage * pageA,TextBlock * blk)2246 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk)
2247 {
2248     page = pageA;
2249     xMin = blk->xMin;
2250     xMax = blk->xMax;
2251     yMin = blk->yMin;
2252     yMax = blk->yMax;
2253     priMin = blk->priMin;
2254     priMax = blk->priMax;
2255     blocks = lastBlk = blk;
2256     next = nullptr;
2257 }
2258 
~TextFlow()2259 TextFlow::~TextFlow()
2260 {
2261     TextBlock *blk;
2262 
2263     while (blocks) {
2264         blk = blocks;
2265         blocks = blocks->next;
2266         delete blk;
2267     }
2268 }
2269 
addBlock(TextBlock * blk)2270 void TextFlow::addBlock(TextBlock *blk)
2271 {
2272     if (lastBlk) {
2273         lastBlk->next = blk;
2274     } else {
2275         blocks = blk;
2276     }
2277     lastBlk = blk;
2278     if (blk->xMin < xMin) {
2279         xMin = blk->xMin;
2280     }
2281     if (blk->xMax > xMax) {
2282         xMax = blk->xMax;
2283     }
2284     if (blk->yMin < yMin) {
2285         yMin = blk->yMin;
2286     }
2287     if (blk->yMax > yMax) {
2288         yMax = blk->yMax;
2289     }
2290 }
2291 
blockFits(const TextBlock * blk,const TextBlock * prevBlk) const2292 bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const
2293 {
2294     bool fits;
2295 
2296     // lower blocks must use smaller fonts
2297     if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2298         return false;
2299     }
2300 
2301     fits = false; // make gcc happy
2302     switch (page->primaryRot) {
2303     case 0:
2304         fits = blk->xMin >= priMin && blk->xMax <= priMax;
2305         break;
2306     case 1:
2307         fits = blk->yMin >= priMin && blk->yMax <= priMax;
2308         break;
2309     case 2:
2310         fits = blk->xMin >= priMin && blk->xMax <= priMax;
2311         break;
2312     case 3:
2313         fits = blk->yMin >= priMin && blk->yMax <= priMax;
2314         break;
2315     }
2316     return fits;
2317 }
2318 
2319 #ifdef TEXTOUT_WORD_LIST
2320 
2321 //------------------------------------------------------------------------
2322 // TextWordList
2323 //------------------------------------------------------------------------
2324 
TextWordList(const TextPage * text,bool physLayout)2325 TextWordList::TextWordList(const TextPage *text, bool physLayout)
2326 {
2327     TextFlow *flow;
2328     TextBlock *blk;
2329     TextLine *line;
2330     TextWord *word;
2331     TextWord **wordArray;
2332     int nWords, i;
2333 
2334     if (text->rawOrder) {
2335         for (word = text->rawWords; word; word = word->next) {
2336             words.push_back(word);
2337         }
2338 
2339     } else if (physLayout) {
2340         // this is inefficient, but it's also the least useful of these
2341         // three cases
2342         nWords = 0;
2343         for (flow = text->flows; flow; flow = flow->next) {
2344             for (blk = flow->blocks; blk; blk = blk->next) {
2345                 for (line = blk->lines; line; line = line->next) {
2346                     for (word = line->words; word; word = word->next) {
2347                         ++nWords;
2348                     }
2349                 }
2350             }
2351         }
2352         wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
2353         i = 0;
2354         for (flow = text->flows; flow; flow = flow->next) {
2355             for (blk = flow->blocks; blk; blk = blk->next) {
2356                 for (line = blk->lines; line; line = line->next) {
2357                     for (word = line->words; word; word = word->next) {
2358                         wordArray[i++] = word;
2359                     }
2360                 }
2361             }
2362         }
2363         qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
2364         for (i = 0; i < nWords; ++i) {
2365             words.push_back(wordArray[i]);
2366         }
2367         gfree(wordArray);
2368 
2369     } else {
2370         for (flow = text->flows; flow; flow = flow->next) {
2371             for (blk = flow->blocks; blk; blk = blk->next) {
2372                 for (line = blk->lines; line; line = line->next) {
2373                     for (word = line->words; word; word = word->next) {
2374                         words.push_back(word);
2375                     }
2376                 }
2377             }
2378         }
2379     }
2380 }
2381 
~TextWordList()2382 TextWordList::~TextWordList() { }
2383 
getLength() const2384 int TextWordList::getLength() const
2385 {
2386     return words.size();
2387 }
2388 
get(int idx)2389 TextWord *TextWordList::get(int idx)
2390 {
2391     if (idx < 0 || idx >= (int)words.size()) {
2392         return nullptr;
2393     }
2394     return words[idx];
2395 }
2396 
2397 #endif // TEXTOUT_WORD_LIST
2398 
2399 //------------------------------------------------------------------------
2400 // TextPage
2401 //------------------------------------------------------------------------
2402 
TextPage(bool rawOrderA,bool discardDiagA)2403 TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2404 {
2405     int rot;
2406 
2407     refCnt = 1;
2408     rawOrder = rawOrderA;
2409     discardDiag = discardDiagA;
2410     curWord = nullptr;
2411     charPos = 0;
2412     curFont = nullptr;
2413     curFontSize = 0;
2414     nest = 0;
2415     nTinyChars = 0;
2416     lastCharOverlap = false;
2417     if (!rawOrder) {
2418         for (rot = 0; rot < 4; ++rot) {
2419             pools[rot] = std::make_unique<TextPool>();
2420         }
2421     }
2422     flows = nullptr;
2423     blocks = nullptr;
2424     rawWords = nullptr;
2425     rawLastWord = nullptr;
2426     lastFindXMin = lastFindYMin = 0;
2427     haveLastFind = false;
2428     mergeCombining = true;
2429     diagonal = false;
2430 }
2431 
~TextPage()2432 TextPage::~TextPage()
2433 {
2434     clear();
2435 }
2436 
incRefCnt()2437 void TextPage::incRefCnt()
2438 {
2439     refCnt++;
2440 }
2441 
decRefCnt()2442 void TextPage::decRefCnt()
2443 {
2444     if (--refCnt == 0)
2445         delete this;
2446 }
2447 
startPage(const GfxState * state)2448 void TextPage::startPage(const GfxState *state)
2449 {
2450     clear();
2451     if (state) {
2452         pageWidth = state->getPageWidth();
2453         pageHeight = state->getPageHeight();
2454     } else {
2455         pageWidth = pageHeight = 0;
2456     }
2457 }
2458 
endPage()2459 void TextPage::endPage()
2460 {
2461     if (curWord) {
2462         endWord();
2463     }
2464 }
2465 
clear()2466 void TextPage::clear()
2467 {
2468     int rot;
2469     TextFlow *flow;
2470     TextWord *word;
2471 
2472     if (curWord) {
2473         delete curWord;
2474         curWord = nullptr;
2475     }
2476     if (rawOrder) {
2477         while (rawWords) {
2478             word = rawWords;
2479             rawWords = rawWords->next;
2480             delete word;
2481         }
2482     } else {
2483         for (rot = 0; rot < 4; ++rot) {
2484             pools[rot] = std::make_unique<TextPool>();
2485         }
2486         while (flows) {
2487             flow = flows;
2488             flows = flows->next;
2489             delete flow;
2490         }
2491         gfree(blocks);
2492     }
2493     fonts.clear();
2494     underlines.clear();
2495     links.clear();
2496 
2497     diagonal = false;
2498     curWord = nullptr;
2499     charPos = 0;
2500     curFont = nullptr;
2501     curFontSize = 0;
2502     nest = 0;
2503     nTinyChars = 0;
2504     flows = nullptr;
2505     blocks = nullptr;
2506     rawWords = nullptr;
2507     rawLastWord = nullptr;
2508 }
2509 
updateFont(const GfxState * state)2510 void TextPage::updateFont(const GfxState *state)
2511 {
2512     GfxFont *gfxFont;
2513     const double *fm;
2514     const char *name;
2515     int code, mCode, letterCode, anyCode;
2516     double w;
2517 
2518     // get the font info object
2519     curFont = nullptr;
2520     for (const std::unique_ptr<TextFontInfo> &f : fonts) {
2521         if (f->matches(state)) {
2522             curFont = f.get();
2523             break;
2524         }
2525     }
2526     if (!curFont) {
2527         fonts.emplace_back(std::make_unique<TextFontInfo>(state));
2528         curFont = fonts.back().get();
2529     }
2530 
2531     // adjust the font size
2532     gfxFont = state->getFont();
2533     curFontSize = state->getTransformedFontSize();
2534     if (gfxFont && gfxFont->getType() == fontType3) {
2535         // This is a hack which makes it possible to deal with some Type 3
2536         // fonts.  The problem is that it's impossible to know what the
2537         // base coordinate system used in the font is without actually
2538         // rendering the font.  This code tries to guess by looking at the
2539         // width of the character 'm' (which breaks if the font is a
2540         // subset that doesn't contain 'm').
2541         mCode = letterCode = anyCode = -1;
2542         for (code = 0; code < 256; ++code) {
2543             name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2544             int nameLen = name ? strlen(name) : 0;
2545             bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2546             if (nameOneChar && name[0] == 'm') {
2547                 mCode = code;
2548             }
2549             if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) {
2550                 letterCode = code;
2551             }
2552             if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2553                 anyCode = code;
2554             }
2555         }
2556         if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2557             // 0.6 is a generic average 'm' width -- yes, this is a hack
2558             curFontSize *= w / 0.6;
2559         } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2560             // even more of a hack: 0.5 is a generic letter width
2561             curFontSize *= w / 0.5;
2562         } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2563             // better than nothing: 0.5 is a generic character width
2564             curFontSize *= w / 0.5;
2565         }
2566         fm = gfxFont->getFontMatrix();
2567         if (fm[0] != 0) {
2568             curFontSize *= fabs(fm[3] / fm[0]);
2569         }
2570     }
2571 }
2572 
beginWord(const GfxState * state)2573 void TextPage::beginWord(const GfxState *state)
2574 {
2575     GfxFont *gfxFont;
2576     const double *fontm;
2577     double m[4], m2[4];
2578     int rot;
2579 
2580     // This check is needed because Type 3 characters can contain
2581     // text-drawing operations (when TextPage is being used via
2582     // {X,Win}SplashOutputDev rather than TextOutputDev).
2583     if (curWord) {
2584         ++nest;
2585         return;
2586     }
2587 
2588     // compute the rotation
2589     state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2590     gfxFont = state->getFont();
2591     if (gfxFont && gfxFont->getType() == fontType3) {
2592         fontm = state->getFont()->getFontMatrix();
2593         m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2594         m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2595         m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2596         m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2597         m[0] = m2[0];
2598         m[1] = m2[1];
2599         m[2] = m2[2];
2600         m[3] = m2[3];
2601     }
2602     if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2603         rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2604     } else {
2605         rot = (m[2] > 0) ? 1 : 3;
2606     }
2607     if (fabs(m[0]) >= fabs(m[1])) {
2608         diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
2609     } else {
2610         diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
2611     }
2612 
2613     // for vertical writing mode, the lines are effectively rotated 90
2614     // degrees
2615     if (gfxFont && gfxFont->getWMode()) {
2616         rot = (rot + 1) & 3;
2617     }
2618 
2619     curWord = new TextWord(state, rot, curFontSize);
2620 }
2621 
addChar(const GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,const Unicode * u,int uLen)2622 void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
2623 {
2624     double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2625     bool overlap;
2626     int i;
2627     int wMode;
2628     Matrix mat;
2629 
2630     // subtract char and word spacing from the dx,dy values
2631     sp = state->getCharSpace();
2632     if (c == (CharCode)0x20) {
2633         sp += state->getWordSpace();
2634     }
2635     state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2636     dx -= dx2;
2637     dy -= dy2;
2638     state->transformDelta(dx, dy, &w1, &h1);
2639 
2640     // throw away chars that aren't inside the page bounds
2641     // (and also do a sanity check on the character size)
2642     state->transform(x, y, &x1, &y1);
2643     if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || x1 != x1 || y1 != y1 || // IEEE way of checking for isnan
2644         w1 != w1 || h1 != h1) {
2645         charPos += nBytes;
2646         return;
2647     }
2648 
2649     // check the tiny chars limit
2650     if (fabs(w1) < 3 && fabs(h1) < 3) {
2651         if (++nTinyChars > 50000) {
2652             charPos += nBytes;
2653             return;
2654         }
2655     }
2656 
2657     // break words at space character
2658     if (uLen == 1 && UnicodeIsWhitespace(u[0])) {
2659         charPos += nBytes;
2660         endWord();
2661         return;
2662     } else if (uLen == 1 && u[0] == (Unicode)0x0) {
2663         // ignore null characters
2664         charPos += nBytes;
2665         return;
2666     }
2667 
2668     state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
2669     mat.m[0] *= state->getHorizScaling();
2670     mat.m[1] *= state->getHorizScaling();
2671     mat.m[4] = x1;
2672     mat.m[5] = y1;
2673 
2674     if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c, u[0], mat)) {
2675         charPos += nBytes;
2676         return;
2677     }
2678 
2679     // start a new word if:
2680     // (1) this character doesn't fall in the right place relative to
2681     //     the end of the previous word (this places upper and lower
2682     //     constraints on the position deltas along both the primary
2683     //     and secondary axes), or
2684     // (2) this character overlaps the previous one (duplicated text), or
2685     // (3) the previous character was an overlap (we want each duplicated
2686     //     character to be in a word by itself at this stage),
2687     // (4) the font size has changed
2688     // (5) the WMode changed
2689     if (curWord && curWord->len > 0) {
2690         base = sp = delta = 0; // make gcc happy
2691         switch (curWord->rot) {
2692         case 0:
2693             base = y1;
2694             sp = x1 - curWord->xMax;
2695             delta = x1 - curWord->edge[curWord->len - 1];
2696             break;
2697         case 1:
2698             base = x1;
2699             sp = y1 - curWord->yMax;
2700             delta = y1 - curWord->edge[curWord->len - 1];
2701             break;
2702         case 2:
2703             base = y1;
2704             sp = curWord->xMin - x1;
2705             delta = curWord->edge[curWord->len - 1] - x1;
2706             break;
2707         case 3:
2708             base = x1;
2709             sp = curWord->yMin - y1;
2710             delta = curWord->edge[curWord->len - 1] - y1;
2711             break;
2712         }
2713         overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2714         wMode = curFont->getWMode();
2715         if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) {
2716             endWord();
2717         }
2718         lastCharOverlap = overlap;
2719     } else {
2720         lastCharOverlap = false;
2721     }
2722 
2723     if (uLen != 0) {
2724         // start a new word if needed
2725         if (!curWord) {
2726             beginWord(state);
2727         }
2728 
2729         // throw away diagonal chars
2730         if (discardDiag && diagonal) {
2731             charPos += nBytes;
2732             return;
2733         }
2734 
2735         // page rotation and/or transform matrices can cause text to be
2736         // drawn in reverse order -- in this case, swap the begin/end
2737         // coordinates and break text into individual chars
2738         if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) {
2739             endWord();
2740             beginWord(state);
2741 
2742             // throw away diagonal chars
2743             if (discardDiag && diagonal) {
2744                 charPos += nBytes;
2745                 return;
2746             }
2747 
2748             x1 += w1;
2749             y1 += h1;
2750             w1 = -w1;
2751             h1 = -h1;
2752         }
2753 
2754         // add the characters to the current word
2755         w1 /= uLen;
2756         h1 /= uLen;
2757         for (i = 0; i < uLen; ++i) {
2758             curWord->addChar(state, curFont, x1 + i * w1, y1 + i * h1, w1, h1, charPos, nBytes, c, u[i], mat);
2759         }
2760     }
2761     charPos += nBytes;
2762 }
2763 
incCharCount(int nChars)2764 void TextPage::incCharCount(int nChars)
2765 {
2766     charPos += nChars;
2767 }
2768 
endWord()2769 void TextPage::endWord()
2770 {
2771     // This check is needed because Type 3 characters can contain
2772     // text-drawing operations (when TextPage is being used via
2773     // {X,Win}SplashOutputDev rather than TextOutputDev).
2774     if (nest > 0) {
2775         --nest;
2776         return;
2777     }
2778 
2779     if (curWord) {
2780         addWord(curWord);
2781         curWord = nullptr;
2782     }
2783 }
2784 
addWord(TextWord * word)2785 void TextPage::addWord(TextWord *word)
2786 {
2787     // throw away zero-length words -- they don't have valid xMin/xMax
2788     // values, and they're useless anyway
2789     if (word->len == 0) {
2790         delete word;
2791         return;
2792     }
2793 
2794     if (rawOrder) {
2795         if (rawLastWord) {
2796             rawLastWord->next = word;
2797         } else {
2798             rawWords = word;
2799         }
2800         rawLastWord = word;
2801     } else {
2802         pools[word->rot]->addWord(word);
2803     }
2804 }
2805 
addUnderline(double x0,double y0,double x1,double y1)2806 void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2807 {
2808     underlines.emplace_back(std::make_unique<TextUnderline>(x0, y0, x1, y1));
2809 }
2810 
addLink(int xMin,int yMin,int xMax,int yMax,AnnotLink * link)2811 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2812 {
2813     links.emplace_back(std::make_unique<TextLink>(xMin, yMin, xMax, yMax, link));
2814 }
2815 
coalesce(bool physLayout,double fixedPitch,bool doHTML)2816 void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2817 {
2818     TextWord *word0, *word1, *word2;
2819     TextLine *line;
2820     TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2821     TextFlow *flow, *lastFlow;
2822     int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2823     double minBase, maxBase, newMinBase, newMaxBase;
2824     double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2825     bool found;
2826     int count[4];
2827     int lrCount;
2828     int col1, col2;
2829     int j, n;
2830 
2831     if (rawOrder) {
2832         primaryRot = 0;
2833         primaryLR = true;
2834         return;
2835     }
2836 
2837     const UnicodeMap *uMap = globalParams->getTextEncoding();
2838     blkList = nullptr;
2839     lastBlk = nullptr;
2840     nBlocks = 0;
2841     primaryRot = 0;
2842 
2843 #if 0 // for debugging
2844   printf("*** initial words ***\n");
2845   for (rot = 0; rot < 4; ++rot) {
2846     pool = pools[rot];
2847     for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2848       for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2849 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2850 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2851 	       word0->base, word0->fontSize, rot*90, word0->link);
2852 	for (i = 0; i < word0->len; ++i) {
2853 	  fputc(word0->text[i] & 0xff, stdout);
2854 	}
2855 	printf("'\n");
2856       }
2857     }
2858   }
2859   printf("\n");
2860 #endif
2861 
2862 #if 0 //~ for debugging
2863   for (i = 0; i < underlines->getLength(); ++i) {
2864     underline = (TextUnderline *)underlines->get(i);
2865     printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2866 	   underline->x0, underline->x1, underline->y0, underline->y1,
2867 	   underline->horiz);
2868   }
2869 #endif
2870 
2871     if (doHTML) {
2872 
2873         //----- handle underlining
2874         for (const std::unique_ptr<TextUnderline> &underline : underlines) {
2875             if (underline->horiz) {
2876                 // rot = 0
2877                 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2878                     startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2879                     endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2880                     for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2881                         for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2882                             //~ need to check the y value against the word baseline
2883                             if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2884                                 word0->underlined = true;
2885                             }
2886                         }
2887                     }
2888                 }
2889 
2890                 // rot = 2
2891                 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2892                     startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2893                     endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2894                     for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2895                         for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2896                             if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2897                                 word0->underlined = true;
2898                             }
2899                         }
2900                     }
2901                 }
2902             } else {
2903                 // rot = 1
2904                 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2905                     startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2906                     endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2907                     for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2908                         for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2909                             if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2910                                 word0->underlined = true;
2911                             }
2912                         }
2913                     }
2914                 }
2915 
2916                 // rot = 3
2917                 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2918                     startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2919                     endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2920                     for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2921                         for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2922                             if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2923                                 word0->underlined = true;
2924                             }
2925                         }
2926                     }
2927                 }
2928             }
2929         }
2930 
2931         //----- handle links
2932         for (const std::unique_ptr<TextLink> &link : links) {
2933             // rot = 0
2934             if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2935                 startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2936                 endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2937                 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2938                     for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2939                         if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2940                             word0->link = link->link;
2941                         }
2942                     }
2943                 }
2944             }
2945 
2946             // rot = 2
2947             if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2948                 startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2949                 endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2950                 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2951                     for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2952                         if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2953                             word0->link = link->link;
2954                         }
2955                     }
2956                 }
2957             }
2958 
2959             // rot = 1
2960             if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2961                 startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2962                 endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2963                 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2964                     for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2965                         if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2966                             word0->link = link->link;
2967                         }
2968                     }
2969                 }
2970             }
2971 
2972             // rot = 3
2973             if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2974                 startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2975                 endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2976                 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2977                     for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2978                         if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2979                             word0->link = link->link;
2980                         }
2981                     }
2982                 }
2983             }
2984         }
2985     }
2986 
2987     //----- assemble the blocks
2988 
2989     //~ add an outer loop for writing mode (vertical text)
2990 
2991     // build blocks for each rotation value
2992     for (rot = 0; rot < 4; ++rot) {
2993         std::unique_ptr<TextPool> &pool = pools[rot];
2994         poolMinBaseIdx = pool->minBaseIdx;
2995         count[rot] = 0;
2996 
2997         // add blocks until no more words are left
2998         while (true) {
2999 
3000             // find the first non-empty line in the pool
3001             for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
3002                 ;
3003             if (poolMinBaseIdx > pool->maxBaseIdx) {
3004                 break;
3005             }
3006 
3007             // look for the left-most word in the first four lines of the
3008             // pool -- this avoids starting with a superscript word
3009             startBaseIdx = poolMinBaseIdx;
3010             for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
3011                 if (!pool->getPool(baseIdx)) {
3012                     continue;
3013                 }
3014                 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
3015                     startBaseIdx = baseIdx;
3016                 }
3017             }
3018 
3019             // create a new block
3020             word0 = pool->getPool(startBaseIdx);
3021             pool->setPool(startBaseIdx, word0->next);
3022             word0->next = nullptr;
3023             blk = new TextBlock(this, rot);
3024             blk->addWord(word0);
3025 
3026             fontSize = word0->fontSize;
3027             minBase = maxBase = word0->base;
3028             colSpace1 = minColSpacing1 * fontSize;
3029             colSpace2 = minColSpacing2 * fontSize;
3030             lineSpace = maxLineSpacingDelta * fontSize;
3031             intraLineSpace = maxIntraLineDelta * fontSize;
3032 
3033             // add words to the block
3034             do {
3035                 found = false;
3036 
3037                 // look for words on the line above the current top edge of
3038                 // the block
3039                 newMinBase = minBase;
3040                 for (baseIdx = pool->getBaseIdx(minBase); baseIdx >= pool->getBaseIdx(minBase - lineSpace); --baseIdx) {
3041                     word0 = nullptr;
3042                     word1 = pool->getPool(baseIdx);
3043                     while (word1) {
3044                         if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3045                             && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3046                             word2 = word1;
3047                             if (word0) {
3048                                 word0->next = word1->next;
3049                             } else {
3050                                 pool->setPool(baseIdx, word1->next);
3051                             }
3052                             word1 = word1->next;
3053                             word2->next = nullptr;
3054                             blk->addWord(word2);
3055                             found = true;
3056                             newMinBase = word2->base;
3057                         } else {
3058                             word0 = word1;
3059                             word1 = word1->next;
3060                         }
3061                     }
3062                 }
3063                 minBase = newMinBase;
3064 
3065                 // look for words on the line below the current bottom edge of
3066                 // the block
3067                 newMaxBase = maxBase;
3068                 for (baseIdx = pool->getBaseIdx(maxBase); baseIdx <= pool->getBaseIdx(maxBase + lineSpace); ++baseIdx) {
3069                     word0 = nullptr;
3070                     word1 = pool->getPool(baseIdx);
3071                     while (word1) {
3072                         if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3073                             && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3074                             word2 = word1;
3075                             if (word0) {
3076                                 word0->next = word1->next;
3077                             } else {
3078                                 pool->setPool(baseIdx, word1->next);
3079                             }
3080                             word1 = word1->next;
3081                             word2->next = nullptr;
3082                             blk->addWord(word2);
3083                             found = true;
3084                             newMaxBase = word2->base;
3085                         } else {
3086                             word0 = word1;
3087                             word1 = word1->next;
3088                         }
3089                     }
3090                 }
3091                 maxBase = newMaxBase;
3092 
3093                 // look for words that are on lines already in the block, and
3094                 // that overlap the block horizontally
3095                 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3096                     word0 = nullptr;
3097                     word1 = pool->getPool(baseIdx);
3098                     while (word1) {
3099                         if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3100                             && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3101                             && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) {
3102                             word2 = word1;
3103                             if (word0) {
3104                                 word0->next = word1->next;
3105                             } else {
3106                                 pool->setPool(baseIdx, word1->next);
3107                             }
3108                             word1 = word1->next;
3109                             word2->next = nullptr;
3110                             blk->addWord(word2);
3111                             found = true;
3112                         } else {
3113                             word0 = word1;
3114                             word1 = word1->next;
3115                         }
3116                     }
3117                 }
3118 
3119                 // only check for outlying words (the next two chunks of code)
3120                 // if we didn't find anything else
3121                 if (found) {
3122                     continue;
3123                 }
3124 
3125                 // scan down the left side of the block, looking for words
3126                 // that are near (but not overlapping) the block; if there are
3127                 // three or fewer, add them to the block
3128                 n = 0;
3129                 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3130                     word1 = pool->getPool(baseIdx);
3131                     while (word1) {
3132                         if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3133                             && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3134                             && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3135                             ++n;
3136                             break;
3137                         }
3138                         word1 = word1->next;
3139                     }
3140                 }
3141                 if (n > 0 && n <= 3) {
3142                     for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3143                         word0 = nullptr;
3144                         word1 = pool->getPool(baseIdx);
3145                         while (word1) {
3146                             if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3147                                 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3148                                 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3149                                 word2 = word1;
3150                                 if (word0) {
3151                                     word0->next = word1->next;
3152                                 } else {
3153                                     pool->setPool(baseIdx, word1->next);
3154                                 }
3155                                 word1 = word1->next;
3156                                 word2->next = nullptr;
3157                                 blk->addWord(word2);
3158                                 if (word2->base < minBase) {
3159                                     minBase = word2->base;
3160                                 } else if (word2->base > maxBase) {
3161                                     maxBase = word2->base;
3162                                 }
3163                                 found = true;
3164                                 break;
3165                             } else {
3166                                 word0 = word1;
3167                                 word1 = word1->next;
3168                             }
3169                         }
3170                     }
3171                 }
3172 
3173                 // scan down the right side of the block, looking for words
3174                 // that are near (but not overlapping) the block; if there are
3175                 // three or fewer, add them to the block
3176                 n = 0;
3177                 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3178                     word1 = pool->getPool(baseIdx);
3179                     while (word1) {
3180                         if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3181                             && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3182                             && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3183                             ++n;
3184                             break;
3185                         }
3186                         word1 = word1->next;
3187                     }
3188                 }
3189                 if (n > 0 && n <= 3) {
3190                     for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3191                         word0 = nullptr;
3192                         word1 = pool->getPool(baseIdx);
3193                         while (word1) {
3194                             if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3195                                 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3196                                 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3197                                 word2 = word1;
3198                                 if (word0) {
3199                                     word0->next = word1->next;
3200                                 } else {
3201                                     pool->setPool(baseIdx, word1->next);
3202                                 }
3203                                 word1 = word1->next;
3204                                 word2->next = nullptr;
3205                                 blk->addWord(word2);
3206                                 if (word2->base < minBase) {
3207                                     minBase = word2->base;
3208                                 } else if (word2->base > maxBase) {
3209                                     maxBase = word2->base;
3210                                 }
3211                                 found = true;
3212                                 break;
3213                             } else {
3214                                 word0 = word1;
3215                                 word1 = word1->next;
3216                             }
3217                         }
3218                     }
3219                 }
3220 
3221             } while (found);
3222 
3223             //~ need to compute the primary writing mode (horiz/vert) in
3224             //~ addition to primary rotation
3225 
3226             // coalesce the block, and add it to the list
3227             blk->coalesce(uMap, fixedPitch);
3228             if (lastBlk) {
3229                 lastBlk->next = blk;
3230             } else {
3231                 blkList = blk;
3232             }
3233             lastBlk = blk;
3234             count[rot] += blk->charCount;
3235             ++nBlocks;
3236         }
3237 
3238         if (count[rot] > count[primaryRot]) {
3239             primaryRot = rot;
3240         }
3241     }
3242 
3243 #if 0 // for debugging
3244   printf("*** rotation ***\n");
3245   for (rot = 0; rot < 4; ++rot) {
3246     printf("  %d: %6d\n", rot, count[rot]);
3247   }
3248   printf("  primary rot = %d\n", primaryRot);
3249   printf("\n");
3250 #endif
3251 
3252 #if 0 // for debugging
3253   printf("*** blocks ***\n");
3254   for (blk = blkList; blk; blk = blk->next) {
3255     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3256 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3257     for (line = blk->lines; line; line = line->next) {
3258       printf("  line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3259 	     line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3260       for (word0 = line->words; word0; word0 = word0->next) {
3261 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3262 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3263 	       word0->base, word0->fontSize, word0->spaceAfter);
3264 	for (i = 0; i < word0->len; ++i) {
3265 	  fputc(word0->text[i] & 0xff, stdout);
3266 	}
3267 	printf("'\n");
3268       }
3269     }
3270   }
3271   printf("\n");
3272 #endif
3273 
3274     // determine the primary direction
3275     lrCount = 0;
3276     for (blk = blkList; blk; blk = blk->next) {
3277         for (line = blk->lines; line; line = line->next) {
3278             for (word0 = line->words; word0; word0 = word0->next) {
3279                 for (int i = 0; i < word0->len; ++i) {
3280                     if (unicodeTypeL(word0->text[i])) {
3281                         ++lrCount;
3282                     } else if (unicodeTypeR(word0->text[i])) {
3283                         --lrCount;
3284                     }
3285                 }
3286             }
3287         }
3288     }
3289     primaryLR = lrCount >= 0;
3290 
3291 #if 0 // for debugging
3292   printf("*** direction ***\n");
3293   printf("lrCount = %d\n", lrCount);
3294   printf("primaryLR = %d\n", primaryLR);
3295 #endif
3296 
3297     //----- column assignment
3298 
3299     // sort blocks into xy order for column assignment
3300     if (blocks)
3301         gfree(blocks);
3302     if (physLayout && fixedPitch) {
3303 
3304         blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3305         int i;
3306         for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3307             blocks[i] = blk;
3308             col1 = 0; // make gcc happy
3309             switch (primaryRot) {
3310             case 0:
3311                 col1 = (int)(blk->xMin / fixedPitch + 0.5);
3312                 break;
3313             case 1:
3314                 col1 = (int)(blk->yMin / fixedPitch + 0.5);
3315                 break;
3316             case 2:
3317                 col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3318                 break;
3319             case 3:
3320                 col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3321                 break;
3322             }
3323             blk->col = col1;
3324             for (line = blk->lines; line; line = line->next) {
3325                 for (j = 0; j <= line->len; ++j) {
3326                     line->col[j] += col1;
3327                 }
3328             }
3329         }
3330 
3331     } else {
3332 
3333         // sort blocks into xy order for column assignment
3334         blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3335         int i;
3336         for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3337             blocks[i] = blk;
3338         }
3339         if (blocks) {
3340             qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
3341         }
3342 
3343         // column assignment
3344         for (i = 0; i < nBlocks; ++i) {
3345             blk0 = blocks[i];
3346             col1 = 0;
3347             for (j = 0; j < i; ++j) {
3348                 blk1 = blocks[j];
3349                 col2 = 0; // make gcc happy
3350                 switch (primaryRot) {
3351                 case 0:
3352                     if (blk0->xMin > blk1->xMax) {
3353                         col2 = blk1->col + blk1->nColumns + 3;
3354                     } else if (blk1->xMax == blk1->xMin) {
3355                         col2 = blk1->col;
3356                     } else {
3357                         col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3358                     }
3359                     break;
3360                 case 1:
3361                     if (blk0->yMin > blk1->yMax) {
3362                         col2 = blk1->col + blk1->nColumns + 3;
3363                     } else if (blk1->yMax == blk1->yMin) {
3364                         col2 = blk1->col;
3365                     } else {
3366                         col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3367                     }
3368                     break;
3369                 case 2:
3370                     if (blk0->xMax < blk1->xMin) {
3371                         col2 = blk1->col + blk1->nColumns + 3;
3372                     } else if (blk1->xMin == blk1->xMax) {
3373                         col2 = blk1->col;
3374                     } else {
3375                         col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3376                     }
3377                     break;
3378                 case 3:
3379                     if (blk0->yMax < blk1->yMin) {
3380                         col2 = blk1->col + blk1->nColumns + 3;
3381                     } else if (blk1->yMin == blk1->yMax) {
3382                         col2 = blk1->col;
3383                     } else {
3384                         col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3385                     }
3386                     break;
3387                 }
3388                 if (col2 > col1) {
3389                     col1 = col2;
3390                 }
3391             }
3392             blk0->col = col1;
3393             for (line = blk0->lines; line; line = line->next) {
3394                 for (j = 0; j <= line->len; ++j) {
3395                     line->col[j] += col1;
3396                 }
3397             }
3398         }
3399     }
3400 
3401 #if 0 // for debugging
3402   printf("*** blocks, after column assignment ***\n");
3403   for (blk = blkList; blk; blk = blk->next) {
3404     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3405 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3406 	   blk->nColumns);
3407     for (line = blk->lines; line; line = line->next) {
3408       printf("  line: col[0]=%d\n", line->col[0]);
3409       for (word0 = line->words; word0; word0 = word0->next) {
3410 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3411 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3412 	       word0->base, word0->fontSize, word0->spaceAfter);
3413 	for (i = 0; i < word0->len; ++i) {
3414 	  fputc(word0->text[i] & 0xff, stdout);
3415 	}
3416 	printf("'\n");
3417       }
3418     }
3419   }
3420   printf("\n");
3421 #endif
3422 
3423     //----- reading order sort
3424 
3425     // compute space on left and right sides of each block
3426     for (int i = 0; i < nBlocks; ++i) {
3427         blk0 = blocks[i];
3428         for (j = 0; j < nBlocks; ++j) {
3429             blk1 = blocks[j];
3430             if (blk1 != blk0) {
3431                 blk0->updatePriMinMax(blk1);
3432             }
3433         }
3434     }
3435 
3436 #if 0 // for debugging
3437   printf("PAGE\n");
3438 #endif
3439 
3440     int sortPos = 0;
3441     bool *visited = (bool *)gmallocn(nBlocks, sizeof(bool));
3442     for (int i = 0; i < nBlocks; i++) {
3443         visited[i] = false;
3444     }
3445 
3446     double bxMin0, byMin0, bxMin1, byMin1;
3447     int numTables = 0;
3448     int tableId = -1;
3449     int correspondenceX, correspondenceY;
3450     double xCentre1, yCentre1, xCentre2, yCentre2;
3451     double xCentre3, yCentre3, xCentre4, yCentre4;
3452     double deltaX, deltaY;
3453     TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr;
3454 
3455     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3456         blk1->ExMin = blk1->xMin;
3457         blk1->ExMax = blk1->xMax;
3458         blk1->EyMin = blk1->yMin;
3459         blk1->EyMax = blk1->yMax;
3460 
3461         bxMin0 = DBL_MAX;
3462         byMin0 = DBL_MAX;
3463         bxMin1 = DBL_MAX;
3464         byMin1 = DBL_MAX;
3465 
3466         fblk2 = nullptr;
3467         fblk3 = nullptr;
3468         fblk4 = nullptr;
3469 
3470         /*  find fblk2, fblk3 and fblk4 so that
3471          *  fblk2 is on the right of blk1 and overlap with blk1 in y axis
3472          *  fblk3 is under blk1 and overlap with blk1 in x axis
3473          *  fblk4 is under blk1 and on the right of blk1
3474          *  and they are closest to blk1
3475          */
3476         for (blk2 = blkList; blk2; blk2 = blk2->next) {
3477             if (blk2 != blk1) {
3478                 if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3479                     bxMin0 = blk2->xMin;
3480                     fblk2 = blk2;
3481                 } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3482                     byMin0 = blk2->yMin;
3483                     fblk3 = blk2;
3484                 } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3485                     bxMin1 = blk2->xMin;
3486                     byMin1 = blk2->yMin;
3487                     fblk4 = blk2;
3488                 }
3489             }
3490         }
3491 
3492         /*  fblk4 can not overlap with fblk3 in x and with fblk2 in y
3493          *  fblk2 can not overlap with fblk3 in x and y
3494          *  fblk4 has to overlap with fblk3 in y and with fblk2 in x
3495          */
3496         if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3497             if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3498                  || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3499                 || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3500                 fblk2 = nullptr;
3501                 fblk3 = nullptr;
3502                 fblk4 = nullptr;
3503             }
3504         }
3505 
3506         // if we found any then look whether they form a table
3507         if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3508             tableId = -1;
3509             correspondenceX = 0;
3510             correspondenceY = 0;
3511             deltaX = 0.0;
3512             deltaY = 0.0;
3513 
3514             if (blk1->lines && blk1->lines->words)
3515                 deltaX = blk1->lines->words->getFontSize();
3516             if (fblk2->lines && fblk2->lines->words)
3517                 deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3518             if (fblk3->lines && fblk3->lines->words)
3519                 deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3520             if (fblk4->lines && fblk4->lines->words)
3521                 deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3522 
3523             deltaY = deltaX;
3524 
3525             deltaX *= minColSpacing1;
3526             deltaY *= maxIntraLineDelta;
3527 
3528             xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3529             yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3530             xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3531             yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3532             xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3533             yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3534             xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3535             yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3536 
3537             // are blocks centrally aligned in x ?
3538             if (fabs(xCentre1 - xCentre3) <= deltaX && fabs(xCentre2 - xCentre4) <= deltaX)
3539                 correspondenceX++;
3540 
3541             // are blocks centrally aligned in y ?
3542             if (fabs(yCentre1 - yCentre2) <= deltaY && fabs(yCentre3 - yCentre4) <= deltaY)
3543                 correspondenceY++;
3544 
3545             // are blocks aligned to the left ?
3546             if (fabs(blk1->xMin - fblk3->xMin) <= deltaX && fabs(fblk2->xMin - fblk4->xMin) <= deltaX)
3547                 correspondenceX++;
3548 
3549             // are blocks aligned to the right ?
3550             if (fabs(blk1->xMax - fblk3->xMax) <= deltaX && fabs(fblk2->xMax - fblk4->xMax) <= deltaX)
3551                 correspondenceX++;
3552 
3553             // are blocks aligned to the top ?
3554             if (fabs(blk1->yMin - fblk2->yMin) <= deltaY && fabs(fblk3->yMin - fblk4->yMin) <= deltaY)
3555                 correspondenceY++;
3556 
3557             // are blocks aligned to the bottom ?
3558             if (fabs(blk1->yMax - fblk2->yMax) <= deltaY && fabs(fblk3->yMax - fblk4->yMax) <= deltaY)
3559                 correspondenceY++;
3560 
3561             // are blocks aligned in x and y ?
3562             if (correspondenceX > 0 && correspondenceY > 0) {
3563 
3564                 // find maximal tableId
3565                 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3566                 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3567                 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3568                 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3569 
3570                 // if the tableId is -1, then we found new table
3571                 if (tableId < 0) {
3572                     tableId = numTables;
3573                     numTables++;
3574                 }
3575 
3576                 blk1->tableId = tableId;
3577                 fblk2->tableId = tableId;
3578                 fblk3->tableId = tableId;
3579                 fblk4->tableId = tableId;
3580             }
3581         }
3582     }
3583 
3584     /*  set extended bounding boxes of all table entries
3585      *  so that they contain whole table
3586      *  (we need to process whole table size when comparing it
3587      *   with regular text blocks)
3588      */
3589     PDFRectangle *envelopes = new PDFRectangle[numTables];
3590     TextBlock **ending_blocks = new TextBlock *[numTables];
3591 
3592     for (int i = 0; i < numTables; i++) {
3593         envelopes[i].x1 = DBL_MAX;
3594         envelopes[i].x2 = DBL_MIN;
3595         envelopes[i].y1 = DBL_MAX;
3596         envelopes[i].y2 = DBL_MIN;
3597     }
3598 
3599     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3600         if (blk1->tableId >= 0) {
3601             if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3602                 envelopes[blk1->tableId].x1 = blk1->ExMin;
3603                 if (!blk1->page->primaryLR)
3604                     ending_blocks[blk1->tableId] = blk1;
3605             }
3606 
3607             if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3608                 envelopes[blk1->tableId].x2 = blk1->ExMax;
3609                 if (blk1->page->primaryLR)
3610                     ending_blocks[blk1->tableId] = blk1;
3611             }
3612 
3613             envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3614             envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3615         }
3616     }
3617 
3618     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3619         if (blk1->tableId >= 0 && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3620             blk1->tableEnd = true;
3621         }
3622     }
3623 
3624     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3625         if (blk1->tableId >= 0) {
3626             blk1->ExMin = envelopes[blk1->tableId].x1;
3627             blk1->ExMax = envelopes[blk1->tableId].x2;
3628             blk1->EyMin = envelopes[blk1->tableId].y1;
3629             blk1->EyMax = envelopes[blk1->tableId].y2;
3630         }
3631     }
3632     delete[] envelopes;
3633     delete[] ending_blocks;
3634 
3635     /*  set extended bounding boxes of all other blocks
3636      *  so that they extend in x without hitting neighbours
3637      */
3638     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3639         if (!(blk1->tableId >= 0)) {
3640             double xMax = DBL_MAX;
3641             double xMin = DBL_MIN;
3642 
3643             for (blk2 = blkList; blk2; blk2 = blk2->next) {
3644                 if (blk2 == blk1)
3645                     continue;
3646 
3647                 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3648                     if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3649                         xMax = blk2->xMin;
3650 
3651                     if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3652                         xMin = blk2->xMax;
3653                 }
3654             }
3655 
3656             for (blk2 = blkList; blk2; blk2 = blk2->next) {
3657                 if (blk2 == blk1)
3658                     continue;
3659 
3660                 if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3661                     blk1->ExMax = blk2->xMax;
3662                 }
3663 
3664                 if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax)
3665                     blk1->ExMin = blk2->xMin;
3666             }
3667         }
3668     }
3669 
3670     int i = -1;
3671     for (blk1 = blkList; blk1; blk1 = blk1->next) {
3672         i++;
3673         sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3674     }
3675     if (visited) {
3676         gfree(visited);
3677     }
3678 
3679 #if 0 // for debugging
3680   printf("*** blocks, after ro sort ***\n");
3681   for (i = 0; i < nBlocks; ++i) {
3682     blk = blocks[i];
3683     printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3684 	   blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3685 	   blk->priMin, blk->priMax);
3686     for (line = blk->lines; line; line = line->next) {
3687       printf("  line:\n");
3688       for (word0 = line->words; word0; word0 = word0->next) {
3689 	printf("    word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3690 	       word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3691 	       word0->base, word0->fontSize, word0->spaceAfter);
3692 	for (j = 0; j < word0->len; ++j) {
3693 	  fputc(word0->text[j] & 0xff, stdout);
3694 	}
3695 	printf("'\n");
3696       }
3697     }
3698   }
3699   printf("\n");
3700   fflush(stdout);
3701 #endif
3702 
3703     // build the flows
3704     //~ this needs to be adjusted for writing mode (vertical text)
3705     //~ this also needs to account for right-to-left column ordering
3706     while (flows) {
3707         flow = flows;
3708         flows = flows->next;
3709         delete flow;
3710     }
3711     flow = nullptr;
3712     flows = lastFlow = nullptr;
3713     // assume blocks are already in reading order,
3714     // and construct flows accordingly.
3715     for (i = 0; i < nBlocks; i++) {
3716         blk = blocks[i];
3717         blk->next = nullptr;
3718         if (flow) {
3719             blk1 = blocks[i - 1];
3720             blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3721             if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk1) && flow->blockFits(blk, blk1)) {
3722                 flow->addBlock(blk);
3723                 continue;
3724             }
3725         }
3726         flow = new TextFlow(this, blk);
3727         if (lastFlow) {
3728             lastFlow->next = flow;
3729         } else {
3730             flows = flow;
3731         }
3732         lastFlow = flow;
3733     }
3734 
3735 #if 0 // for debugging
3736   printf("*** flows ***\n");
3737   for (flow = flows; flow; flow = flow->next) {
3738     printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3739 	   flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3740 	   flow->priMin, flow->priMax);
3741     for (blk = flow->blocks; blk; blk = blk->next) {
3742       printf("  block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3743 	     blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3744 	     blk->priMin, blk->priMax);
3745       for (line = blk->lines; line; line = line->next) {
3746 	printf("    line:\n");
3747 	for (word0 = line->words; word0; word0 = word0->next) {
3748 	  printf("      word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3749 		 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3750 		 word0->base, word0->fontSize, word0->spaceAfter);
3751 	  for (i = 0; i < word0->len; ++i) {
3752 	    fputc(word0->text[i] & 0xff, stdout);
3753 	  }
3754 	  printf("'\n");
3755 	}
3756       }
3757     }
3758   }
3759   printf("\n");
3760 #endif
3761 }
3762 
adjustRotation(TextLine * line,int start,int end,double * xMin,double * xMax,double * yMin,double * yMax)3763 void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
3764 {
3765     switch (line->rot) {
3766     case 0:
3767         *xMin = line->edge[start];
3768         *xMax = line->edge[end];
3769         *yMin = line->yMin;
3770         *yMax = line->yMax;
3771         break;
3772     case 1:
3773         *xMin = line->xMin;
3774         *xMax = line->xMax;
3775         *yMin = line->edge[start];
3776         *yMax = line->edge[end];
3777         break;
3778     case 2:
3779         *xMin = line->edge[end];
3780         *xMax = line->edge[start];
3781         *yMin = line->yMin;
3782         *yMax = line->yMax;
3783         break;
3784     case 3:
3785         *xMin = line->xMin;
3786         *xMax = line->xMax;
3787         *yMin = line->edge[end];
3788         *yMax = line->edge[start];
3789         break;
3790     }
3791 }
3792 
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3793 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
3794 {
3795     return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, false, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
3796 }
3797 
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool ignoreDiacritics,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3798 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
3799                         double *yMax)
3800 {
3801     return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
3802 }
3803 
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool ignoreDiacritics,bool matchAcrossLines,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax,PDFRectangle * continueMatch,bool * ignoredHyphen)3804 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
3805                         double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
3806 {
3807     TextBlock *blk;
3808     TextLine *line;
3809     Unicode *s2, *txt, *reordered;
3810     Unicode *p;
3811     Unicode *nextline;
3812     int nextline_len;
3813     bool nextlineAfterHyphen = false;
3814     int txtSize, m, i, j, k;
3815     double xStart, yStart, xStop, yStop;
3816     double xMin0, yMin0, xMax0, yMax0;
3817     double xMin1, yMin1, xMax1, yMax1;
3818     double xMin2, yMin2, xMax2, yMax2;
3819     bool found;
3820 
3821     if (len == 0) {
3822         return false;
3823     }
3824 
3825     if (rawOrder) {
3826         return false;
3827     }
3828 
3829     if (matchAcrossLines && backward) {
3830         // matchAcrossLines is unimplemented for backward search
3831         matchAcrossLines = false;
3832     }
3833 
3834     // handle right-to-left text
3835     reordered = (Unicode *)gmallocn(len, sizeof(Unicode));
3836     reorderText(s, len, nullptr, primaryLR, nullptr, reordered);
3837 
3838     // normalize the search string
3839     s2 = unicodeNormalizeNFKC(reordered, len, &len, nullptr);
3840 
3841     // if search string is not pure ascii then don't
3842     // use ignoreDiacritics (as they won't match)
3843     if (!caseSensitive) {
3844         // convert the search string to uppercase
3845         for (i = 0; i < len; ++i) {
3846             s2[i] = unicodeToUpper(s2[i]);
3847             if (ignoreDiacritics && !isAscii7(s2[i]))
3848                 ignoreDiacritics = false;
3849         }
3850     } else if (ignoreDiacritics) {
3851         for (i = 0; i < len; ++i) {
3852             if (!isAscii7(s2[i])) {
3853                 ignoreDiacritics = false;
3854                 break;
3855             }
3856         }
3857     }
3858 
3859     txt = nullptr;
3860     txtSize = 0;
3861 
3862     xStart = yStart = xStop = yStop = 0;
3863     if (startAtLast && haveLastFind) {
3864         xStart = lastFindXMin;
3865         yStart = lastFindYMin;
3866     } else if (!startAtTop) {
3867         xStart = *xMin;
3868         yStart = *yMin;
3869     }
3870     if (stopAtLast && haveLastFind) {
3871         xStop = lastFindXMin;
3872         yStop = lastFindYMin;
3873     } else if (!stopAtBottom) {
3874         xStop = *xMax;
3875         yStop = *yMax;
3876     }
3877 
3878     found = false;
3879     xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3880     xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3881 
3882     for (i = backward ? nBlocks - 1 : 0; backward ? i >= 0 : i < nBlocks; i += backward ? -1 : 1) {
3883         blk = blocks[i];
3884 
3885         // check: is the block above the top limit?
3886         // (this only works if the page's primary rotation is zero --
3887         // otherwise the blocks won't be sorted in the useful order)
3888         if (!startAtTop && primaryRot == 0 && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3889             continue;
3890         }
3891 
3892         // check: is the block below the bottom limit?
3893         // (this only works if the page's primary rotation is zero --
3894         // otherwise the blocks won't be sorted in the useful order)
3895         if (!stopAtBottom && primaryRot == 0 && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3896             break;
3897         }
3898 
3899         for (line = blk->lines; line; line = line->next) {
3900 
3901             // check: is the line above the top limit?
3902             // (this only works if the page's primary rotation is zero --
3903             // otherwise the lines won't be sorted in the useful order)
3904             if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) {
3905                 continue;
3906             }
3907 
3908             // check: is the line below the bottom limit?
3909             // (this only works if the page's primary rotation is zero --
3910             // otherwise the lines won't be sorted in the useful order)
3911             if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) {
3912                 continue;
3913             }
3914 
3915             if (!line->normalized)
3916                 line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
3917 
3918             if (matchAcrossLines && line->next && !line->next->normalized)
3919                 line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, &line->next->normalized_len, &line->next->normalized_idx, true);
3920             nextline = nullptr;
3921             nextline_len = 0;
3922 
3923             // convert the line to uppercase
3924             m = line->normalized_len;
3925 
3926             if (ignoreDiacritics) {
3927                 if (!line->ascii_translation)
3928                     unicodeToAscii7(line->normalized, line->normalized_len, &line->ascii_translation, &line->ascii_len, line->normalized_idx, &line->ascii_idx);
3929                 if (line->ascii_len)
3930                     m = line->ascii_len;
3931                 else
3932                     ignoreDiacritics = false;
3933 
3934                 if (matchAcrossLines && line->next && !line->next->ascii_translation)
3935                     unicodeToAscii7(line->next->normalized, line->next->normalized_len, &line->next->ascii_translation, &line->next->ascii_len, line->next->normalized_idx, &line->next->ascii_idx);
3936             }
3937             if (!caseSensitive) {
3938                 if (m > txtSize) {
3939                     txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3940                     txtSize = m;
3941                 }
3942                 for (k = 0; k < m; ++k) {
3943                     if (ignoreDiacritics)
3944                         txt[k] = unicodeToUpper(line->ascii_translation[k]);
3945                     else
3946                         txt[k] = unicodeToUpper(line->normalized[k]);
3947                 }
3948                 if (matchAcrossLines && line->next) {
3949                     nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
3950                     nextline = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
3951                     for (k = 0; k < nextline_len; ++k) {
3952                         nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) : unicodeToUpper(line->next->normalized[k]);
3953                     }
3954                 }
3955             } else {
3956                 if (ignoreDiacritics)
3957                     txt = line->ascii_translation;
3958                 else
3959                     txt = line->normalized;
3960 
3961                 if (matchAcrossLines && line->next) {
3962                     nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
3963                     nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized;
3964                 }
3965             }
3966 
3967             // search each position in this line
3968             j = backward ? m - len : 0;
3969             p = txt + j;
3970             while (backward ? j >= 0 : j <= m - (nextline ? 1 : len)) {
3971                 bool wholeWordStartIsOk, wholeWordEndIsOk;
3972                 if (wholeWord) {
3973                     wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]);
3974                     if (nextline)
3975                         wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
3976                     else
3977                         wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
3978                 }
3979                 if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
3980                     int n = 0;
3981                     bool spaceConsumedByNewline = false;
3982                     bool found_it;
3983 
3984                     // compare the strings
3985                     for (k = 0; k < len; ++k) {
3986                         bool last_char_of_line = j + k == m - 1;
3987                         bool last_char_of_search_term = k == len - 1;
3988 
3989                         if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) {
3990                             // now check if the comparison failed at the end-of-line hyphen,
3991                             // and if so, keep on comparing at the next line
3992                             nextlineAfterHyphen = false;
3993 
3994                             if (s2[k] == p[k]) {
3995                                 if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(s2[k + 1])) {
3996                                     break;
3997                                 }
3998                                 k++;
3999                             } else if (p[k] != (Unicode)'-' || UnicodeIsWhitespace(s2[k]))
4000                                 break;
4001                             else
4002                                 nextlineAfterHyphen = true;
4003 
4004                             for (; n < nextline_len && k < len; ++k, ++n) {
4005                                 if (nextline[n] != s2[k]) {
4006                                     if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(s2[k])) {
4007                                         n = -1;
4008                                         spaceConsumedByNewline = true;
4009                                         continue;
4010                                     }
4011                                     break;
4012                                 }
4013                             }
4014                             break;
4015                         }
4016                     }
4017 
4018                     found_it = k == len;
4019                     if (found_it && nextline && wholeWord) { // check word end for nextline case
4020                         if (n) // Match ended at next line
4021                             wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline[n]);
4022                         else // Match ended on same line
4023                             wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
4024 
4025                         if (!wholeWordEndIsOk)
4026                             found_it = false;
4027                     }
4028                     // found it
4029                     if (found_it) {
4030                         bool nextLineMatch = (bool)n;
4031                         if (spaceConsumedByNewline)
4032                             k--;
4033                         // where s2 matches a subsequence of a compatibility equivalence
4034                         // decomposition, highlight the entire glyph, since we don't know
4035                         // the internal layout of subglyph components
4036                         int normStart, normAfterEnd;
4037                         if (ignoreDiacritics) {
4038                             normStart = line->ascii_idx[j];
4039                             if (nextline)
4040                                 normAfterEnd = line->ascii_idx[j + k - n];
4041                             else
4042                                 normAfterEnd = line->ascii_idx[j + len - 1] + 1;
4043                         } else {
4044                             normStart = line->normalized_idx[j];
4045                             if (nextline)
4046                                 normAfterEnd = line->normalized_idx[j + k - n];
4047                             else
4048                                 normAfterEnd = line->normalized_idx[j + len - 1] + 1;
4049                         }
4050 
4051                         adjustRotation(line, normStart, normAfterEnd, &xMin1, &xMax1, &yMin1, &yMax1);
4052 
4053                         if (backward) {
4054                             if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
4055                                 if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
4056                                     xMin0 = xMin1;
4057                                     xMax0 = xMax1;
4058                                     yMin0 = yMin1;
4059                                     yMax0 = yMax1;
4060                                     found = true;
4061                                 }
4062                             }
4063                         } else {
4064                             if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
4065                                 if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
4066                                     xMin0 = xMin1;
4067                                     xMax0 = xMax1;
4068                                     yMin0 = yMin1;
4069                                     yMax0 = yMax1;
4070                                     found = true;
4071                                     if (nextLineMatch) { // set the out parameters
4072                                         if (ignoredHyphen)
4073                                             *ignoredHyphen = nextlineAfterHyphen;
4074 
4075                                         if (continueMatch) {
4076                                             adjustRotation(line->next, 0, n, &xMin2, &xMax2, &yMin2, &yMax2);
4077                                             continueMatch->x1 = xMin2;
4078                                             continueMatch->y1 = yMax2;
4079                                             continueMatch->x2 = xMax2;
4080                                             continueMatch->y2 = yMin2;
4081                                         }
4082                                     }
4083                                 }
4084                             }
4085                         }
4086                     }
4087                 }
4088                 if (backward) {
4089                     --j;
4090                     --p;
4091                 } else {
4092                     ++j;
4093                     ++p;
4094                 }
4095             }
4096 
4097             if (nextline && nextline != line->next->ascii_translation && nextline != line->next->normalized) {
4098                 gfree(nextline);
4099             }
4100         }
4101     }
4102 
4103     gfree(s2);
4104     gfree(reordered);
4105     if (!caseSensitive) {
4106         gfree(txt);
4107     }
4108 
4109     if (found) {
4110         *xMin = xMin0;
4111         *xMax = xMax0;
4112         *yMin = yMin0;
4113         *yMax = yMax0;
4114         lastFindXMin = xMin0;
4115         lastFindYMin = yMin0;
4116         haveLastFind = true;
4117         return true;
4118     }
4119 
4120     return false;
4121 }
4122 
getText(double xMin,double yMin,double xMax,double yMax,EndOfLineKind textEOL) const4123 GooString *TextPage::getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const
4124 {
4125     GooString *s;
4126     const UnicodeMap *uMap;
4127     TextBlock *blk;
4128     TextLine *line;
4129     TextLineFrag *frags;
4130     int nFrags, fragsSize;
4131     TextLineFrag *frag;
4132     char space[8], eol[16];
4133     int spaceLen, eolLen;
4134     int lastRot;
4135     double x, y, delta;
4136     int col, idx0, idx1, i, j;
4137     bool multiLine, oneRot;
4138 
4139     s = new GooString();
4140 
4141     // get the output encoding
4142     if (!(uMap = globalParams->getTextEncoding())) {
4143         return s;
4144     }
4145 
4146     if (rawOrder) {
4147         TextWord *word;
4148         char mbc[16];
4149         int mbc_len;
4150 
4151         for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4152             for (j = 0; j < word->getLength(); ++j) {
4153                 double gXMin, gXMax, gYMin, gYMax;
4154                 word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
4155                 if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) {
4156                     mbc_len = uMap->mapUnicode(*(word->getChar(j)), mbc, sizeof(mbc));
4157                     s->append(mbc, mbc_len);
4158                 }
4159             }
4160         }
4161         return s;
4162     }
4163 
4164     spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4165     eolLen = 0; // make gcc happy
4166     switch (textEOL) {
4167     case eolUnix:
4168         eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4169         break;
4170     case eolDOS:
4171         eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4172         eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4173         break;
4174     case eolMac:
4175         eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4176         break;
4177     }
4178 
4179     //~ writing mode (horiz/vert)
4180 
4181     // collect the line fragments that are in the rectangle
4182     fragsSize = 256;
4183     frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4184     nFrags = 0;
4185     lastRot = -1;
4186     oneRot = true;
4187     for (i = 0; i < nBlocks; ++i) {
4188         blk = blocks[i];
4189         if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) {
4190             for (line = blk->lines; line; line = line->next) {
4191                 if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) {
4192                     idx0 = idx1 = -1;
4193                     switch (line->rot) {
4194                     case 0:
4195                         y = 0.5 * (line->yMin + line->yMax);
4196                         if (yMin < y && y < yMax) {
4197                             j = 0;
4198                             while (j < line->len) {
4199                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4200                                     idx0 = j;
4201                                     break;
4202                                 }
4203                                 ++j;
4204                             }
4205                             j = line->len - 1;
4206                             while (j >= 0) {
4207                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4208                                     idx1 = j;
4209                                     break;
4210                                 }
4211                                 --j;
4212                             }
4213                         }
4214                         break;
4215                     case 1:
4216                         x = 0.5 * (line->xMin + line->xMax);
4217                         if (xMin < x && x < xMax) {
4218                             j = 0;
4219                             while (j < line->len) {
4220                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4221                                     idx0 = j;
4222                                     break;
4223                                 }
4224                                 ++j;
4225                             }
4226                             j = line->len - 1;
4227                             while (j >= 0) {
4228                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4229                                     idx1 = j;
4230                                     break;
4231                                 }
4232                                 --j;
4233                             }
4234                         }
4235                         break;
4236                     case 2:
4237                         y = 0.5 * (line->yMin + line->yMax);
4238                         if (yMin < y && y < yMax) {
4239                             j = 0;
4240                             while (j < line->len) {
4241                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4242                                     idx0 = j;
4243                                     break;
4244                                 }
4245                                 ++j;
4246                             }
4247                             j = line->len - 1;
4248                             while (j >= 0) {
4249                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4250                                     idx1 = j;
4251                                     break;
4252                                 }
4253                                 --j;
4254                             }
4255                         }
4256                         break;
4257                     case 3:
4258                         x = 0.5 * (line->xMin + line->xMax);
4259                         if (xMin < x && x < xMax) {
4260                             j = 0;
4261                             while (j < line->len) {
4262                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4263                                     idx0 = j;
4264                                     break;
4265                                 }
4266                                 ++j;
4267                             }
4268                             j = line->len - 1;
4269                             while (j >= 0) {
4270                                 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4271                                     idx1 = j;
4272                                     break;
4273                                 }
4274                                 --j;
4275                             }
4276                         }
4277                         break;
4278                     }
4279                     if (idx0 >= 0 && idx1 >= 0) {
4280                         if (nFrags == fragsSize) {
4281                             fragsSize *= 2;
4282                             frags = (TextLineFrag *)greallocn(frags, fragsSize, sizeof(TextLineFrag));
4283                         }
4284                         frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
4285                         ++nFrags;
4286                         if (lastRot >= 0 && line->rot != lastRot) {
4287                             oneRot = false;
4288                         }
4289                         lastRot = line->rot;
4290                     }
4291                 }
4292             }
4293         }
4294     }
4295 
4296     // sort the fragments and generate the string
4297     if (nFrags > 0) {
4298 
4299         for (i = 0; i < nFrags; ++i) {
4300             frags[i].computeCoords(oneRot);
4301         }
4302         assignColumns(frags, nFrags, oneRot);
4303 
4304         // if all lines in the region have the same rotation, use it;
4305         // otherwise, use the page's primary rotation
4306         if (oneRot) {
4307             qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXLineRot);
4308         } else {
4309             qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
4310         }
4311         i = 0;
4312         while (i < nFrags) {
4313             delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4314             for (j = i + 1; j < nFrags && fabs(frags[j].base - frags[i].base) < delta; ++j)
4315                 ;
4316             qsort(frags + i, j - i, sizeof(TextLineFrag), oneRot ? &TextLineFrag::cmpXYColumnLineRot : &TextLineFrag::cmpXYColumnPrimaryRot);
4317             i = j;
4318         }
4319 
4320         col = 0;
4321         multiLine = false;
4322         for (i = 0; i < nFrags; ++i) {
4323             frag = &frags[i];
4324 
4325             // insert a return
4326             if (frag->col < col || (i > 0 && fabs(frag->base - frags[i - 1].base) > maxIntraLineDelta * frags[i - 1].line->words->fontSize)) {
4327                 s->append(eol, eolLen);
4328                 col = 0;
4329                 multiLine = true;
4330             }
4331 
4332             // column alignment
4333             for (; col < frag->col; ++col) {
4334                 s->append(space, spaceLen);
4335             }
4336 
4337             // get the fragment text
4338             col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4339         }
4340 
4341         if (multiLine) {
4342             s->append(eol, eolLen);
4343         }
4344     }
4345 
4346     gfree(frags);
4347 
4348     return s;
4349 }
4350 
4351 class TextSelectionVisitor
4352 {
4353 public:
4354     explicit TextSelectionVisitor(TextPage *page);
4355     virtual ~TextSelectionVisitor();
4356     TextSelectionVisitor(const TextSelectionVisitor &) = delete;
4357     TextSelectionVisitor &operator=(const TextSelectionVisitor &) = delete;
4358     virtual void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) = 0;
4359     virtual void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) = 0;
4360     virtual void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) = 0;
4361 
4362 protected:
4363     TextPage *page;
4364 };
4365 
TextSelectionVisitor(TextPage * p)4366 TextSelectionVisitor::TextSelectionVisitor(TextPage *p) : page(p) { }
4367 
4368 TextSelectionVisitor::~TextSelectionVisitor() = default;
4369 
4370 class TextSelectionDumper : public TextSelectionVisitor
4371 {
4372 public:
4373     explicit TextSelectionDumper(TextPage *page);
4374     ~TextSelectionDumper() override;
4375 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4376     void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4377     void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4378     void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4379     void endPage();
4380 
4381     GooString *getText();
4382     std::vector<TextWordSelection *> **takeWordList(int *nLines);
4383 
4384 private:
4385     void startLine();
4386     void finishLine();
4387 
4388     std::vector<TextWordSelection *> **lines;
4389     int nLines, linesSize;
4390     std::vector<TextWordSelection *> *words;
4391     int tableId;
4392     TextBlock *currentBlock;
4393 };
4394 
TextSelectionDumper(TextPage * p)4395 TextSelectionDumper::TextSelectionDumper(TextPage *p) : TextSelectionVisitor(p)
4396 {
4397     linesSize = 256;
4398     lines = (std::vector<TextWordSelection *> **)gmallocn(linesSize, sizeof(std::vector<TextWordSelection *> *));
4399     nLines = 0;
4400 
4401     tableId = -1;
4402     currentBlock = nullptr;
4403     words = nullptr;
4404 }
4405 
~TextSelectionDumper()4406 TextSelectionDumper::~TextSelectionDumper()
4407 {
4408     for (int i = 0; i < nLines; i++) {
4409         for (auto entry : *(lines[i])) {
4410             delete entry;
4411         }
4412         delete lines[i];
4413     }
4414     gfree(lines);
4415 }
4416 
startLine()4417 void TextSelectionDumper::startLine()
4418 {
4419     finishLine();
4420     words = new std::vector<TextWordSelection *>();
4421 }
4422 
finishLine()4423 void TextSelectionDumper::finishLine()
4424 {
4425     if (nLines == linesSize) {
4426         linesSize *= 2;
4427         lines = (std::vector<TextWordSelection *> **)grealloc(lines, linesSize * sizeof(std::vector<TextWordSelection *> *));
4428     }
4429 
4430     if (words && words->size() > 0) {
4431         // Reverse word order for RTL text. Fixes #53 for glib backend (Evince)
4432         if (!page->primaryLR)
4433             std::reverse(words->begin(), words->end());
4434 
4435         lines[nLines++] = words;
4436     } else if (words)
4437         delete words;
4438     words = nullptr;
4439 }
4440 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4441 void TextSelectionDumper::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4442 {
4443     TextLineFrag frag;
4444 
4445     frag.init(line, edge_begin, edge_end - edge_begin);
4446 
4447     if (tableId >= 0 && frag.line->blk->tableId < 0) {
4448         finishLine();
4449 
4450         tableId = -1;
4451         currentBlock = nullptr;
4452     }
4453 
4454     if (frag.line->blk->tableId >= 0) { // a table
4455         if (tableId == -1) {
4456             tableId = frag.line->blk->tableId;
4457             currentBlock = frag.line->blk;
4458         }
4459 
4460         if (currentBlock == frag.line->blk) { // the same block
4461             startLine();
4462         } else { // another block
4463             if (currentBlock->tableEnd) { // previous block ended its row
4464                 startLine();
4465             }
4466             currentBlock = frag.line->blk;
4467         }
4468     } else { // not a table
4469         startLine();
4470     }
4471 }
4472 
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4473 void TextSelectionDumper::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4474 {
4475     words->push_back(new TextWordSelection(word, begin, end));
4476 }
4477 
endPage()4478 void TextSelectionDumper::endPage()
4479 {
4480     finishLine();
4481 }
4482 
getText()4483 GooString *TextSelectionDumper::getText()
4484 {
4485     GooString *text;
4486     int i;
4487     const UnicodeMap *uMap;
4488     char space[8], eol[16];
4489     int spaceLen, eolLen;
4490 
4491     text = new GooString();
4492 
4493     if (!(uMap = globalParams->getTextEncoding()))
4494         return text;
4495 
4496     spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4497     eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4498 
4499     for (i = 0; i < nLines; i++) {
4500         std::vector<TextWordSelection *> *lineWords = lines[i];
4501         for (std::size_t j = 0; j < lineWords->size(); j++) {
4502             TextWordSelection *sel = (*lineWords)[j];
4503 
4504             page->dumpFragment(sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
4505             if (j < lineWords->size() - 1 && sel->word->spaceAfter)
4506                 text->append(space, spaceLen);
4507         }
4508         if (i < nLines - 1)
4509             text->append(eol, eolLen);
4510     }
4511 
4512     return text;
4513 }
4514 
takeWordList(int * nLinesOut)4515 std::vector<TextWordSelection *> **TextSelectionDumper::takeWordList(int *nLinesOut)
4516 {
4517     std::vector<TextWordSelection *> **returnValue = lines;
4518 
4519     *nLinesOut = nLines;
4520     if (nLines == 0)
4521         return nullptr;
4522 
4523     nLines = 0;
4524     lines = nullptr;
4525 
4526     return returnValue;
4527 }
4528 
4529 class TextSelectionSizer : public TextSelectionVisitor
4530 {
4531 public:
4532     TextSelectionSizer(TextPage *page, double scale);
~TextSelectionSizer()4533     ~TextSelectionSizer() override { delete list; }
4534 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4535     void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4536     void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4537     void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override {};
4538 
takeRegion()4539     std::vector<PDFRectangle *> *takeRegion()
4540     {
4541         auto aux = list;
4542         list = nullptr;
4543         return aux;
4544     }
4545 
4546 private:
4547     std::vector<PDFRectangle *> *list;
4548     double scale;
4549 };
4550 
TextSelectionSizer(TextPage * p,double s)4551 TextSelectionSizer::TextSelectionSizer(TextPage *p, double s) : TextSelectionVisitor(p), scale(s)
4552 {
4553     list = new std::vector<PDFRectangle *>();
4554 }
4555 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4556 void TextSelectionSizer::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4557 {
4558     PDFRectangle *rect;
4559     double x1, y1, x2, y2, margin;
4560 
4561     switch (line->rot) {
4562     default:
4563     case 0:
4564         margin = (line->yMax - line->yMin) / 8;
4565         x1 = line->edge[edge_begin];
4566         x2 = line->edge[edge_end];
4567         y1 = line->yMin - margin;
4568         y2 = line->yMax + margin;
4569         break;
4570     case 1:
4571         margin = (line->xMax - line->xMin) / 8;
4572         x1 = line->xMin - margin;
4573         x2 = line->xMax + margin;
4574         y1 = line->edge[edge_begin];
4575         y2 = line->edge[edge_end];
4576         break;
4577     case 2:
4578         margin = (line->yMax - line->yMin) / 8;
4579         x1 = line->edge[edge_end];
4580         x2 = line->edge[edge_begin];
4581         y1 = line->yMin - margin;
4582         y2 = line->yMax + margin;
4583         break;
4584     case 3:
4585         margin = (line->xMax - line->xMin) / 8;
4586         x1 = line->xMin - margin;
4587         x2 = line->xMax + margin;
4588         y1 = line->edge[edge_end];
4589         y2 = line->edge[edge_begin];
4590         break;
4591     }
4592 
4593     rect = new PDFRectangle(floor(x1 * scale), floor(y1 * scale), ceil(x2 * scale), ceil(y2 * scale));
4594     list->push_back(rect);
4595 }
4596 
4597 class TextSelectionPainter : public TextSelectionVisitor
4598 {
4599 public:
4600     TextSelectionPainter(TextPage *page, double scale, int rotation, OutputDev *out, const GfxColor *box_color, const GfxColor *glyph_color);
4601     ~TextSelectionPainter() override;
4602 
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4603     void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4604     void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4605     void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4606     void endPage();
4607 
4608 private:
4609     OutputDev *out;
4610     const GfxColor *glyph_color;
4611     GfxState *state;
4612     std::vector<TextWordSelection *> *selectionList;
4613     Matrix ctm, ictm;
4614     bool hasGlyphLessFont();
4615 };
4616 
TextSelectionPainter(TextPage * p,double scale,int rotation,OutputDev * outA,const GfxColor * box_color,const GfxColor * glyph_colorA)4617 TextSelectionPainter::TextSelectionPainter(TextPage *p, double scale, int rotation, OutputDev *outA, const GfxColor *box_color, const GfxColor *glyph_colorA) : TextSelectionVisitor(p), out(outA), glyph_color(glyph_colorA)
4618 {
4619     PDFRectangle box(0, 0, p->pageWidth, p->pageHeight);
4620 
4621     selectionList = new std::vector<TextWordSelection *>();
4622     state = new GfxState(72 * scale, 72 * scale, &box, rotation, false);
4623 
4624     state->getCTM(&ctm);
4625     ctm.invertTo(&ictm);
4626 
4627     out->startPage(0, state, nullptr);
4628     out->setDefaultCTM(state->getCTM());
4629 
4630     state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4631     state->setFillColor(box_color);
4632     out->updateFillColor(state);
4633 }
4634 
~TextSelectionPainter()4635 TextSelectionPainter::~TextSelectionPainter()
4636 {
4637     for (auto entry : *selectionList) {
4638         delete entry;
4639     }
4640     delete selectionList;
4641     delete state;
4642 }
4643 
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4644 void TextSelectionPainter::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4645 {
4646     double x1, y1, x2, y2, margin;
4647 
4648     switch (line->rot) {
4649     default:
4650     case 0:
4651         margin = (line->yMax - line->yMin) / 8;
4652         x1 = line->edge[edge_begin];
4653         x2 = line->edge[edge_end];
4654         y1 = line->yMin - margin;
4655         y2 = line->yMax + margin;
4656         break;
4657     case 1:
4658         margin = (line->xMax - line->xMin) / 8;
4659         x1 = line->xMin - margin;
4660         x2 = line->xMax + margin;
4661         y1 = line->edge[edge_begin];
4662         y2 = line->edge[edge_end];
4663         break;
4664     case 2:
4665         margin = (line->yMax - line->yMin) / 8;
4666         x1 = line->edge[edge_end];
4667         x2 = line->edge[edge_begin];
4668         y1 = line->yMin - margin;
4669         y2 = line->yMax + margin;
4670         break;
4671     case 3:
4672         margin = (line->xMax - line->xMin) / 8;
4673         x1 = line->xMin - margin;
4674         x2 = line->xMax + margin;
4675         y1 = line->edge[edge_end];
4676         y2 = line->edge[edge_begin];
4677         break;
4678     }
4679 
4680     ctm.transform(x1, y1, &x1, &y1);
4681     ctm.transform(x2, y2, &x2, &y2);
4682 
4683     if (x1 < x2) {
4684         x1 = floor(x1);
4685         x2 = ceil(x2);
4686     } else {
4687         x1 = ceil(x1);
4688         x2 = floor(x2);
4689     }
4690 
4691     if (y1 < y2) {
4692         y1 = floor(y1);
4693         y2 = ceil(y2);
4694     } else {
4695         y1 = ceil(y1);
4696         y2 = floor(y2);
4697     }
4698 
4699     ictm.transform(x1, y1, &x1, &y1);
4700     ictm.transform(x2, y2, &x2, &y2);
4701 
4702     state->moveTo(x1, y1);
4703     state->lineTo(x2, y1);
4704     state->lineTo(x2, y2);
4705     state->lineTo(x1, y2);
4706     state->closePath();
4707 }
4708 
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4709 void TextSelectionPainter::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4710 {
4711     selectionList->push_back(new TextWordSelection(word, begin, end));
4712 }
4713 
hasGlyphLessFont()4714 bool TextSelectionPainter::hasGlyphLessFont()
4715 {
4716     if (selectionList && selectionList->size()) {
4717         TextWordSelection *sel = (*selectionList)[0];
4718         return sel->word->invisible;
4719     }
4720 
4721     return false;
4722 }
4723 
endPage()4724 void TextSelectionPainter::endPage()
4725 {
4726     out->fill(state);
4727 
4728     out->saveState(state);
4729     out->clip(state);
4730 
4731     state->clearPath();
4732 
4733     state->setFillColor(glyph_color);
4734 
4735     bool usingGlyphLessFont = hasGlyphLessFont();
4736     /* Paint transparent selection when using tesseract glyphless font. Issue #157 */
4737     if (usingGlyphLessFont) {
4738         state->setFillOpacity(glyphlessSelectionOpacity);
4739     }
4740 
4741     out->updateFillColor(state);
4742 
4743     for (const TextWordSelection *sel : *selectionList) {
4744         int begin = sel->begin;
4745 
4746         while (begin < sel->end) {
4747             TextFontInfo *font = sel->word->font[begin];
4748             font->gfxFont->incRefCnt();
4749             Matrix *mat = &sel->word->textMat[begin];
4750 
4751             state->setTextMat(mat->m[0], mat->m[1], mat->m[2], mat->m[3], 0, 0);
4752             state->setFont(font->gfxFont, 1);
4753             out->updateFont(state);
4754 
4755             int fEnd = begin + 1;
4756             while (fEnd < sel->end && font->matches(sel->word->font[fEnd]) && mat->m[0] == sel->word->textMat[fEnd].m[0] && mat->m[1] == sel->word->textMat[fEnd].m[1] && mat->m[2] == sel->word->textMat[fEnd].m[2]
4757                    && mat->m[3] == sel->word->textMat[fEnd].m[3])
4758                 fEnd++;
4759 
4760             /* The only purpose of this string is to let the output device query
4761              * it's length.  Might want to change this interface later. */
4762             GooString *string = new GooString((char *)sel->word->charcode, fEnd - begin);
4763             out->beginString(state, string);
4764 
4765             if (!usingGlyphLessFont) {
4766                 for (int j = begin; j < fEnd; j++) {
4767                     if (j != begin && sel->word->charPos[j] == sel->word->charPos[j - 1])
4768                         continue;
4769                     out->drawChar(state, sel->word->textMat[j].m[4], sel->word->textMat[j].m[5], 0, 0, 0, 0, sel->word->charcode[j], 1, nullptr, 0);
4770                 }
4771             }
4772             out->endString(state);
4773             delete string;
4774             begin = fEnd;
4775         }
4776     }
4777 
4778     out->restoreState(state);
4779     out->endPage();
4780 }
4781 
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4782 void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4783 {
4784     int i, begin, end;
4785     double mid, s1, s2;
4786 
4787     if (rot == 0 || rot == 2) {
4788         s1 = selection->x1;
4789         s2 = selection->x2;
4790     } else {
4791         s1 = selection->y1;
4792         s2 = selection->y2;
4793     }
4794 
4795     begin = len;
4796     end = 0;
4797     for (i = 0; i < len; i++) {
4798         mid = (edge[i] + edge[i + 1]) / 2;
4799         if (XBetweenAB(mid, s1, s2)) {
4800             if (i < begin)
4801                 begin = i;
4802 
4803             end = i + 1;
4804         }
4805     }
4806 
4807     /* Skip empty selection. */
4808     if (end <= begin)
4809         return;
4810 
4811     visitor->visitWord(this, begin, end, selection);
4812 }
4813 
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4814 void TextLine::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4815 {
4816     TextWord *p, *begin, *end, *current;
4817     int i, edge_begin, edge_end;
4818     PDFRectangle child_selection;
4819     double s1, s2, pMin, pMax;
4820 
4821     if (rot == 0 || rot == 2) {
4822         s1 = selection->x1;
4823         s2 = selection->x2;
4824     } else {
4825         s1 = selection->y1;
4826         s2 = selection->y2;
4827     }
4828 
4829     begin = nullptr;
4830     end = nullptr;
4831     current = nullptr;
4832     for (p = words; p != nullptr; p = p->next) {
4833         if (rot == 0 || rot == 2) {
4834             pMin = p->xMin;
4835             pMax = p->xMax;
4836         } else {
4837             pMin = p->yMin;
4838             pMax = p->yMax;
4839         }
4840 
4841         if (blk->page->primaryLR) {
4842             if (((s1 < pMax) || (s2 < pMax)) && begin == nullptr)
4843                 begin = p;
4844 
4845             if (((s1 > pMin) || (s2 > pMin)) && begin != nullptr) {
4846                 end = p->next;
4847                 current = p;
4848             }
4849         } else {
4850             if (((s1 > pMin) || (s2 > pMin)) && begin == nullptr)
4851                 begin = p;
4852 
4853             if (((s1 < pMax) || (s2 < pMax)) && begin != nullptr) {
4854                 end = p->next;
4855                 current = p;
4856             }
4857         }
4858     }
4859 
4860     if (!current)
4861         current = begin;
4862 
4863     child_selection = *selection;
4864     if (style == selectionStyleWord) {
4865         if (rot == 0 || rot == 2) {
4866             child_selection.x1 = begin ? begin->xMin : xMin;
4867             if (end && end->xMax != -1) {
4868                 child_selection.x2 = current->xMax;
4869             } else {
4870                 child_selection.x2 = xMax;
4871             }
4872         } else {
4873             child_selection.y1 = begin ? begin->yMin : yMin;
4874             if (end && end->yMax != -1) {
4875                 child_selection.y2 = current->yMax;
4876             } else {
4877                 child_selection.y2 = yMax;
4878             }
4879         }
4880     }
4881 
4882     if (rot == 0 || rot == 2) {
4883         s1 = child_selection.x1;
4884         s2 = child_selection.x2;
4885     } else {
4886         s1 = child_selection.y1;
4887         s2 = child_selection.y2;
4888     }
4889 
4890     edge_begin = len;
4891     edge_end = 0;
4892     for (i = 0; i < len; i++) {
4893         double mid = (edge[i] + edge[i + 1]) / 2;
4894         if (XBetweenAB(mid, s1, s2)) {
4895             if (i < edge_begin)
4896                 edge_begin = i;
4897 
4898             edge_end = i + 1;
4899         }
4900     }
4901 
4902     /* Skip empty selection. */
4903     if (edge_end <= edge_begin)
4904         return;
4905 
4906     visitor->visitLine(this, begin, end, edge_begin, edge_end, &child_selection);
4907 
4908     for (p = begin; p != end; p = p->next)
4909         p->visitSelection(visitor, &child_selection, style);
4910 }
4911 
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4912 void TextBlock::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4913 {
4914     PDFRectangle child_selection;
4915     double x[2], y[2], d, best_d[2];
4916     TextLine *p, *best_line[2];
4917     int i, count = 0, best_count[2], start, stop;
4918     bool all[2];
4919 
4920     x[0] = selection->x1;
4921     y[0] = selection->y1;
4922     x[1] = selection->x2;
4923     y[1] = selection->y2;
4924 
4925     for (i = 0; i < 2; i++) {
4926         // the first/last lines are often not nearest
4927         // the corners, so we have to force them to be
4928         // selected when the selection runs outside this
4929         // block.
4930         if (page->primaryLR) {
4931             all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
4932             if (x[i] <= this->xMin && y[i] <= this->yMin) {
4933                 best_line[i] = this->lines;
4934                 best_count[i] = 1;
4935             } else {
4936                 best_line[i] = nullptr;
4937                 best_count[i] = 0;
4938             }
4939         } else {
4940             all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
4941             if (x[i] >= this->xMax && y[i] <= this->yMin) {
4942                 best_line[i] = this->lines;
4943                 best_count[i] = 1;
4944             } else {
4945                 best_line[i] = nullptr;
4946                 best_count[i] = 0;
4947             }
4948         }
4949         best_d[i] = 0;
4950     }
4951 
4952     // find the nearest line to the selection points
4953     // using the manhattan distance.
4954     for (p = this->lines; p; p = p->next) {
4955         count++;
4956         for (i = 0; i < 2; i++) {
4957             d = fmax(p->xMin - x[i], 0.0) + fmax(x[i] - p->xMax, 0.0) + fmax(p->yMin - y[i], 0.0) + fmax(y[i] - p->yMax, 0.0);
4958             if (!best_line[i] || all[i] || d < best_d[i]) {
4959                 best_line[i] = p;
4960                 best_count[i] = count;
4961                 best_d[i] = d;
4962             }
4963         }
4964     }
4965     // assert: best is always set.
4966     if (!best_line[0] || !best_line[1]) {
4967         return;
4968     }
4969 
4970     // Now decide which point was first.
4971     if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
4972         start = 0;
4973         stop = 1;
4974     } else {
4975         start = 1;
4976         stop = 0;
4977     }
4978 
4979     visitor->visitBlock(this, best_line[start], best_line[stop], selection);
4980 
4981     for (p = best_line[start]; p; p = p->next) {
4982         if (page->primaryLR) {
4983             child_selection.x1 = p->xMin;
4984             child_selection.x2 = p->xMax;
4985         } else {
4986             child_selection.x1 = p->xMax;
4987             child_selection.x2 = p->xMin;
4988         }
4989         child_selection.y1 = p->yMin;
4990         child_selection.y2 = p->yMax;
4991         if (style == selectionStyleLine) {
4992             if (p == best_line[start]) {
4993                 child_selection.x1 = 0;
4994                 child_selection.y1 = 0;
4995             }
4996             if (p == best_line[stop]) {
4997                 child_selection.x2 = page->pageWidth;
4998                 child_selection.y2 = page->pageHeight;
4999             }
5000         } else {
5001             if (p == best_line[start]) {
5002                 child_selection.x1 = fmax(p->xMin, fmin(p->xMax, x[start]));
5003                 child_selection.y1 = fmax(p->yMin, fmin(p->yMax, y[start]));
5004             }
5005             if (p == best_line[stop]) {
5006                 child_selection.x2 = fmax(p->xMin, fmin(p->xMax, x[stop]));
5007                 child_selection.y2 = fmax(p->yMin, fmin(p->yMax, y[stop]));
5008             }
5009         }
5010         p->visitSelection(visitor, &child_selection, style);
5011         if (p == best_line[stop]) {
5012             return;
5013         }
5014     }
5015 }
5016 
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)5017 void TextPage::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
5018 {
5019     PDFRectangle child_selection;
5020     double x[2], y[2], d, best_d[2];
5021     double xMin, yMin, xMax, yMax;
5022     TextFlow *flow, *best_flow[2];
5023     TextBlock *blk, *best_block[2];
5024     int i, count = 0, best_count[2], start, stop;
5025 
5026     if (!flows)
5027         return;
5028 
5029     x[0] = selection->x1;
5030     y[0] = selection->y1;
5031     x[1] = selection->x2;
5032     y[1] = selection->y2;
5033 
5034     xMin = pageWidth;
5035     yMin = pageHeight;
5036     xMax = 0.0;
5037     yMax = 0.0;
5038 
5039     for (i = 0; i < 2; i++) {
5040         best_block[i] = nullptr;
5041         best_flow[i] = nullptr;
5042         best_count[i] = 0;
5043         best_d[i] = 0;
5044     }
5045 
5046     // find the nearest blocks to the selection points
5047     // using the manhattan distance.
5048     for (flow = flows; flow; flow = flow->next) {
5049         for (blk = flow->blocks; blk; blk = blk->next) {
5050             count++;
5051             // the first/last blocks in reading order are
5052             // often not the closest to the page corners;
5053             // track the corners, force those blocks to
5054             // be selected if the selection runs across
5055             // multiple pages.
5056             xMin = fmin(xMin, blk->xMin);
5057             yMin = fmin(yMin, blk->yMin);
5058             xMax = fmax(xMax, blk->xMax);
5059             yMax = fmax(yMax, blk->yMax);
5060             for (i = 0; i < 2; i++) {
5061                 d = fmax(blk->xMin - x[i], 0.0) + fmax(x[i] - blk->xMax, 0.0) + fmax(blk->yMin - y[i], 0.0) + fmax(y[i] - blk->yMax, 0.0);
5062                 if (!best_block[i] || d < best_d[i] || (!blk->next && !flow->next && x[i] >= fmin(xMax, pageWidth) && y[i] >= fmin(yMax, pageHeight))) {
5063                     best_block[i] = blk;
5064                     best_flow[i] = flow;
5065                     best_count[i] = count;
5066                     best_d[i] = d;
5067                 }
5068             }
5069         }
5070     }
5071     for (i = 0; i < 2; i++) {
5072         if (primaryLR) {
5073             if (x[i] < xMin && y[i] < yMin) {
5074                 best_block[i] = flows->blocks;
5075                 best_flow[i] = flows;
5076                 best_count[i] = 1;
5077             }
5078         } else {
5079             if (x[i] > xMax && y[i] < yMin) {
5080                 best_block[i] = flows->blocks;
5081                 best_flow[i] = flows;
5082                 best_count[i] = 1;
5083             }
5084         }
5085     }
5086     // assert: best is always set.
5087     if (!best_block[0] || !best_block[1]) {
5088         return;
5089     }
5090 
5091     // Now decide which point was first.
5092     if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
5093         start = 0;
5094         stop = 1;
5095     } else {
5096         start = 1;
5097         stop = 0;
5098     }
5099 
5100     for (flow = best_flow[start]; flow; flow = flow->next) {
5101         if (flow == best_flow[start]) {
5102             blk = best_block[start];
5103         } else {
5104             blk = flow->blocks;
5105         }
5106         for (; blk; blk = blk->next) {
5107             if (primaryLR) {
5108                 child_selection.x1 = blk->xMin;
5109                 child_selection.x2 = blk->xMax;
5110             } else {
5111                 child_selection.x1 = blk->xMax;
5112                 child_selection.x2 = blk->xMin;
5113             }
5114             child_selection.y1 = blk->yMin;
5115             child_selection.y2 = blk->yMax;
5116             if (blk == best_block[start]) {
5117                 child_selection.x1 = fmax(blk->xMin, fmin(blk->xMax, x[start]));
5118                 child_selection.y1 = fmax(blk->yMin, fmin(blk->yMax, y[start]));
5119             }
5120             if (blk == best_block[stop]) {
5121                 child_selection.x2 = fmax(blk->xMin, fmin(blk->xMax, x[stop]));
5122                 child_selection.y2 = fmax(blk->yMin, fmin(blk->yMax, y[stop]));
5123                 blk->visitSelection(visitor, &child_selection, style);
5124                 return;
5125             }
5126             blk->visitSelection(visitor, &child_selection, style);
5127         }
5128     }
5129 }
5130 
drawSelection(OutputDev * out,double scale,int rotation,const PDFRectangle * selection,SelectionStyle style,const GfxColor * glyph_color,const GfxColor * box_color)5131 void TextPage::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5132 {
5133     TextSelectionPainter painter(this, scale, rotation, out, box_color, glyph_color);
5134 
5135     visitSelection(&painter, selection, style);
5136     painter.endPage();
5137 }
5138 
getSelectionRegion(const PDFRectangle * selection,SelectionStyle style,double scale)5139 std::vector<PDFRectangle *> *TextPage::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5140 {
5141     TextSelectionSizer sizer(this, scale);
5142 
5143     visitSelection(&sizer, selection, style);
5144 
5145     return sizer.takeRegion();
5146 }
5147 
getSelectionText(const PDFRectangle * selection,SelectionStyle style)5148 GooString *TextPage::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5149 {
5150     TextSelectionDumper dumper(this);
5151 
5152     visitSelection(&dumper, selection, style);
5153     dumper.endPage();
5154 
5155     return dumper.getText();
5156 }
5157 
getSelectionWords(const PDFRectangle * selection,SelectionStyle style,int * nLines)5158 std::vector<TextWordSelection *> **TextPage::getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines)
5159 {
5160     TextSelectionDumper dumper(this);
5161 
5162     visitSelection(&dumper, selection, style);
5163     dumper.endPage();
5164 
5165     return dumper.takeWordList(nLines);
5166 }
5167 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax) const5168 bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5169 {
5170     TextBlock *blk;
5171     TextLine *line;
5172     TextWord *word;
5173     double xMin0, xMax0, yMin0, yMax0;
5174     double xMin1, xMax1, yMin1, yMax1;
5175     bool first;
5176     int i, j0, j1;
5177 
5178     if (rawOrder) {
5179         return false;
5180     }
5181 
5182     //~ this doesn't correctly handle ranges split across multiple lines
5183     //~ (the highlighted region is the bounding box of all the parts of
5184     //~ the range)
5185     first = true;
5186     xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
5187     xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
5188     for (i = 0; i < nBlocks; ++i) {
5189         blk = blocks[i];
5190         for (line = blk->lines; line; line = line->next) {
5191             for (word = line->words; word; word = word->next) {
5192                 if (pos < word->charPos[word->len] && pos + length > word->charPos[0]) {
5193                     for (j0 = 0; j0 < word->len && pos >= word->charPos[j0 + 1]; ++j0)
5194                         ;
5195                     for (j1 = word->len - 1; j1 > j0 && pos + length <= word->charPos[j1]; --j1)
5196                         ;
5197                     switch (line->rot) {
5198                     case 0:
5199                         xMin1 = word->edge[j0];
5200                         xMax1 = word->edge[j1 + 1];
5201                         yMin1 = word->yMin;
5202                         yMax1 = word->yMax;
5203                         break;
5204                     case 1:
5205                         xMin1 = word->xMin;
5206                         xMax1 = word->xMax;
5207                         yMin1 = word->edge[j0];
5208                         yMax1 = word->edge[j1 + 1];
5209                         break;
5210                     case 2:
5211                         xMin1 = word->edge[j1 + 1];
5212                         xMax1 = word->edge[j0];
5213                         yMin1 = word->yMin;
5214                         yMax1 = word->yMax;
5215                         break;
5216                     case 3:
5217                         xMin1 = word->xMin;
5218                         xMax1 = word->xMax;
5219                         yMin1 = word->edge[j1 + 1];
5220                         yMax1 = word->edge[j0];
5221                         break;
5222                     }
5223                     if (first || xMin1 < xMin0) {
5224                         xMin0 = xMin1;
5225                     }
5226                     if (first || xMax1 > xMax0) {
5227                         xMax0 = xMax1;
5228                     }
5229                     if (first || yMin1 < yMin0) {
5230                         yMin0 = yMin1;
5231                     }
5232                     if (first || yMax1 > yMax0) {
5233                         yMax0 = yMax1;
5234                     }
5235                     first = false;
5236                 }
5237             }
5238         }
5239     }
5240     if (!first) {
5241         *xMin = xMin0;
5242         *xMax = xMax0;
5243         *yMin = yMin0;
5244         *yMax = yMax0;
5245         return true;
5246     }
5247     return false;
5248 }
5249 
dump(void * outputStream,TextOutputFunc outputFunc,bool physLayout,EndOfLineKind textEOL,bool pageBreaks)5250 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks)
5251 {
5252     const UnicodeMap *uMap;
5253     TextFlow *flow;
5254     TextBlock *blk;
5255     TextLine *line;
5256     TextLineFrag *frags;
5257     TextWord *word;
5258     int nFrags, fragsSize;
5259     TextLineFrag *frag;
5260     char space[8], eol[16], eop[8];
5261     int spaceLen, eolLen, eopLen;
5262     GooString *s;
5263     double delta;
5264     int col, i, j, d, n;
5265 
5266     // get the output encoding
5267     if (!(uMap = globalParams->getTextEncoding())) {
5268         return;
5269     }
5270     spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
5271     eolLen = 0; // make gcc happy
5272     switch (textEOL) {
5273     case eolUnix:
5274         eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
5275         break;
5276     case eolDOS:
5277         eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5278         eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
5279         break;
5280     case eolMac:
5281         eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5282         break;
5283     }
5284     eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
5285 
5286     //~ writing mode (horiz/vert)
5287 
5288     // output the page in raw (content stream) order
5289     if (rawOrder) {
5290 
5291         for (word = rawWords; word; word = word->next) {
5292             s = new GooString();
5293             dumpFragment(word->text, word->len, uMap, s);
5294             (*outputFunc)(outputStream, s->c_str(), s->getLength());
5295             delete s;
5296             if (word->next && fabs(word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) {
5297                 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
5298                     (*outputFunc)(outputStream, space, spaceLen);
5299                 }
5300             } else {
5301                 (*outputFunc)(outputStream, eol, eolLen);
5302             }
5303         }
5304 
5305         // output the page, maintaining the original physical layout
5306     } else if (physLayout) {
5307 
5308         // collect the line fragments for the page and sort them
5309         fragsSize = 256;
5310         frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
5311         nFrags = 0;
5312         for (i = 0; i < nBlocks; ++i) {
5313             blk = blocks[i];
5314             for (line = blk->lines; line; line = line->next) {
5315                 if (nFrags == fragsSize) {
5316                     fragsSize *= 2;
5317                     frags = (TextLineFrag *)greallocn(frags, fragsSize, sizeof(TextLineFrag));
5318                 }
5319                 frags[nFrags].init(line, 0, line->len);
5320                 frags[nFrags].computeCoords(true);
5321                 ++nFrags;
5322             }
5323         }
5324         qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
5325         i = 0;
5326         while (i < nFrags) {
5327             delta = maxIntraLineDelta * frags[i].line->words->fontSize;
5328             for (j = i + 1; j < nFrags && fabs(frags[j].base - frags[i].base) < delta; ++j)
5329                 ;
5330             qsort(frags + i, j - i, sizeof(TextLineFrag), &TextLineFrag::cmpXYColumnPrimaryRot);
5331             i = j;
5332         }
5333 
5334 #if 0 // for debugging
5335     printf("*** line fragments ***\n");
5336     for (i = 0; i < nFrags; ++i) {
5337       frag = &frags[i];
5338       printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5339 	     frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
5340       for (n = 0; n < frag->len; ++n) {
5341 	fputc(frag->line->text[frag->start + n] & 0xff, stdout);
5342       }
5343       printf("'\n");
5344     }
5345     printf("\n");
5346 #endif
5347 
5348         // generate output
5349         col = 0;
5350         for (i = 0; i < nFrags; ++i) {
5351             frag = &frags[i];
5352 
5353             // column alignment
5354             for (; col < frag->col; ++col) {
5355                 (*outputFunc)(outputStream, space, spaceLen);
5356             }
5357 
5358             // print the line
5359             s = new GooString();
5360             col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
5361             (*outputFunc)(outputStream, s->c_str(), s->getLength());
5362             delete s;
5363 
5364             // print one or more returns if necessary
5365             if (i == nFrags - 1 || frags[i + 1].col < col || fabs(frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) {
5366                 if (i < nFrags - 1) {
5367                     d = (int)((frags[i + 1].base - frag->base) / frag->line->words->fontSize);
5368                     if (d < 1) {
5369                         d = 1;
5370                     } else if (d > 5) {
5371                         d = 5;
5372                     }
5373                 } else {
5374                     d = 1;
5375                 }
5376                 for (; d > 0; --d) {
5377                     (*outputFunc)(outputStream, eol, eolLen);
5378                 }
5379                 col = 0;
5380             }
5381         }
5382 
5383         gfree(frags);
5384 
5385         // output the page, "undoing" the layout
5386     } else {
5387         for (flow = flows; flow; flow = flow->next) {
5388             for (blk = flow->blocks; blk; blk = blk->next) {
5389                 for (line = blk->lines; line; line = line->next) {
5390                     n = line->len;
5391                     if (line->hyphenated && (line->next || blk->next)) {
5392                         --n;
5393                     }
5394                     s = new GooString();
5395                     dumpFragment(line->text, n, uMap, s);
5396                     (*outputFunc)(outputStream, s->c_str(), s->getLength());
5397                     delete s;
5398                     // output a newline when a hyphen is not suppressed
5399                     if (n == line->len) {
5400                         (*outputFunc)(outputStream, eol, eolLen);
5401                     }
5402                 }
5403             }
5404             (*outputFunc)(outputStream, eol, eolLen);
5405         }
5406     }
5407 
5408     // end of page
5409     if (pageBreaks) {
5410         (*outputFunc)(outputStream, eop, eopLen);
5411     }
5412 }
5413 
setMergeCombining(bool merge)5414 void TextPage::setMergeCombining(bool merge)
5415 {
5416     mergeCombining = merge;
5417 }
5418 
assignColumns(TextLineFrag * frags,int nFrags,bool oneRot) const5419 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, bool oneRot) const
5420 {
5421     TextLineFrag *frag0, *frag1;
5422     int rot, col1, col2, i, j, k;
5423 
5424     // all text in the region has the same rotation -- recompute the
5425     // column numbers based only on the text in the region
5426     if (oneRot) {
5427         qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
5428         rot = frags[0].line->rot;
5429         for (i = 0; i < nFrags; ++i) {
5430             frag0 = &frags[i];
5431             col1 = 0;
5432             for (j = 0; j < i; ++j) {
5433                 frag1 = &frags[j];
5434                 col2 = 0; // make gcc happy
5435                 switch (rot) {
5436                 case 0:
5437                     if (frag0->xMin >= frag1->xMax) {
5438                         col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5439                     } else {
5440                         for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5441                             ;
5442                         col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5443                     }
5444                     break;
5445                 case 1:
5446                     if (frag0->yMin >= frag1->yMax) {
5447                         col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5448                     } else {
5449                         for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5450                             ;
5451                         col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5452                     }
5453                     break;
5454                 case 2:
5455                     if (frag0->xMax <= frag1->xMin) {
5456                         col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5457                     } else {
5458                         for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5459                             ;
5460                         col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5461                     }
5462                     break;
5463                 case 3:
5464                     if (frag0->yMax <= frag1->yMin) {
5465                         col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5466                     } else {
5467                         for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5468                             ;
5469                         col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5470                     }
5471                     break;
5472                 }
5473                 if (col2 > col1) {
5474                     col1 = col2;
5475                 }
5476             }
5477             frag0->col = col1;
5478         }
5479 
5480         // the region includes text at different rotations -- use the
5481         // globally assigned column numbers, offset by the minimum column
5482         // number (i.e., shift everything over to column 0)
5483     } else {
5484         col1 = frags[0].col;
5485         for (i = 1; i < nFrags; ++i) {
5486             if (frags[i].col < col1) {
5487                 col1 = frags[i].col;
5488             }
5489         }
5490         for (i = 0; i < nFrags; ++i) {
5491             frags[i].col -= col1;
5492         }
5493     }
5494 }
5495 
dumpFragment(const Unicode * text,int len,const UnicodeMap * uMap,GooString * s) const5496 int TextPage::dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const
5497 {
5498     if (uMap->isUnicode()) {
5499         return reorderText(text, len, uMap, primaryLR, s, nullptr);
5500     } else {
5501         int nCols = 0;
5502 
5503         char buf[8];
5504         int buflen = 0;
5505 
5506         for (int i = 0; i < len; ++i) {
5507             buflen = uMap->mapUnicode(text[i], buf, sizeof(buf));
5508             s->append(buf, buflen);
5509             nCols += buflen;
5510         }
5511 
5512         return nCols;
5513     }
5514 }
5515 
5516 #ifdef TEXTOUT_WORD_LIST
makeWordList(bool physLayout)5517 std::unique_ptr<TextWordList> TextPage::makeWordList(bool physLayout)
5518 {
5519     return std::make_unique<TextWordList>(this, physLayout);
5520 }
5521 #endif
5522 
5523 //------------------------------------------------------------------------
5524 // ActualText
5525 //------------------------------------------------------------------------
ActualText(TextPage * out)5526 ActualText::ActualText(TextPage *out)
5527 {
5528     out->incRefCnt();
5529     text = out;
5530     actualText = nullptr;
5531     actualTextNBytes = 0;
5532 }
5533 
~ActualText()5534 ActualText::~ActualText()
5535 {
5536     if (actualText)
5537         delete actualText;
5538     text->decRefCnt();
5539 }
5540 
addChar(const GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,const Unicode * u,int uLen)5541 void ActualText::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
5542 {
5543     if (!actualText) {
5544         text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5545         return;
5546     }
5547 
5548     // Inside ActualText span.
5549     if (!actualTextNBytes) {
5550         actualTextX0 = x;
5551         actualTextY0 = y;
5552     }
5553     actualTextX1 = x + dx;
5554     actualTextY1 = y + dy;
5555     actualTextNBytes += nBytes;
5556 }
5557 
begin(const GfxState * state,const GooString * t)5558 void ActualText::begin(const GfxState *state, const GooString *t)
5559 {
5560     if (actualText)
5561         delete actualText;
5562     actualText = new GooString(t);
5563     actualTextNBytes = 0;
5564 }
5565 
end(const GfxState * state)5566 void ActualText::end(const GfxState *state)
5567 {
5568     // ActualText span closed. Output the span text and the
5569     // extents of all the glyphs inside the span
5570 
5571     if (actualTextNBytes) {
5572         Unicode *uni = nullptr;
5573         int length;
5574 
5575         // now that we have the position info for all of the text inside
5576         // the marked content span, we feed the "ActualText" back through
5577         // text->addChar()
5578         length = TextStringToUCS4(actualText->toStr(), &uni);
5579         text->addChar(state, actualTextX0, actualTextY0, actualTextX1 - actualTextX0, actualTextY1 - actualTextY0, 0, actualTextNBytes, uni, length);
5580         gfree(uni);
5581     }
5582 
5583     delete actualText;
5584     actualText = nullptr;
5585     actualTextNBytes = 0;
5586 }
5587 
5588 //------------------------------------------------------------------------
5589 // TextOutputDev
5590 //------------------------------------------------------------------------
5591 
TextOutputDev_outputToFile(void * stream,const char * text,int len)5592 static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
5593 {
5594     fwrite(text, 1, len, (FILE *)stream);
5595 }
5596 
TextOutputDev(const char * fileName,bool physLayoutA,double fixedPitchA,bool rawOrderA,bool append,bool discardDiagA)5597 TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA)
5598 {
5599     text = nullptr;
5600     physLayout = physLayoutA;
5601     fixedPitch = physLayout ? fixedPitchA : 0;
5602     rawOrder = rawOrderA;
5603     discardDiag = discardDiagA;
5604     doHTML = false;
5605     textEOL = defaultEndOfLine();
5606     textPageBreaks = true;
5607     ok = true;
5608 
5609     // open file
5610     needClose = false;
5611     if (fileName) {
5612         if (!strcmp(fileName, "-")) {
5613             outputStream = stdout;
5614 #if defined(_WIN32) || defined(__CYGWIN__)
5615             // keep DOS from munging the end-of-line characters
5616             _setmode(fileno(stdout), O_BINARY);
5617 #endif
5618         } else if ((outputStream = openFile(fileName, append ? "ab" : "wb"))) {
5619             needClose = true;
5620         } else {
5621             error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
5622             ok = false;
5623             actualText = nullptr;
5624             return;
5625         }
5626         outputFunc = &TextOutputDev_outputToFile;
5627     } else {
5628         outputStream = nullptr;
5629     }
5630 
5631     // set up text object
5632     text = new TextPage(rawOrderA, discardDiagA);
5633     actualText = new ActualText(text);
5634 }
5635 
TextOutputDev(TextOutputFunc func,void * stream,bool physLayoutA,double fixedPitchA,bool rawOrderA,bool discardDiagA)5636 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA)
5637 {
5638     outputFunc = func;
5639     outputStream = stream;
5640     needClose = false;
5641     physLayout = physLayoutA;
5642     fixedPitch = physLayout ? fixedPitchA : 0;
5643     rawOrder = rawOrderA;
5644     discardDiag = discardDiagA;
5645     doHTML = false;
5646     text = new TextPage(rawOrderA, discardDiagA);
5647     actualText = new ActualText(text);
5648     textEOL = defaultEndOfLine();
5649     textPageBreaks = true;
5650     ok = true;
5651 }
5652 
~TextOutputDev()5653 TextOutputDev::~TextOutputDev()
5654 {
5655     if (needClose) {
5656         fclose((FILE *)outputStream);
5657     }
5658     if (text) {
5659         text->decRefCnt();
5660     }
5661     delete actualText;
5662 }
5663 
startPage(int pageNum,GfxState * state,XRef * xref)5664 void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
5665 {
5666     text->startPage(state);
5667 }
5668 
endPage()5669 void TextOutputDev::endPage()
5670 {
5671     text->endPage();
5672     text->coalesce(physLayout, fixedPitch, doHTML);
5673     if (outputStream) {
5674         text->dump(outputStream, outputFunc, physLayout, textEOL, textPageBreaks);
5675     }
5676 }
5677 
restoreState(GfxState * state)5678 void TextOutputDev::restoreState(GfxState *state)
5679 {
5680     text->updateFont(state);
5681 }
5682 
updateFont(GfxState * state)5683 void TextOutputDev::updateFont(GfxState *state)
5684 {
5685     text->updateFont(state);
5686 }
5687 
beginString(GfxState * state,const GooString * s)5688 void TextOutputDev::beginString(GfxState *state, const GooString *s) { }
5689 
endString(GfxState * state)5690 void TextOutputDev::endString(GfxState *state) { }
5691 
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,const Unicode * u,int uLen)5692 void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen)
5693 {
5694     actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5695 }
5696 
incCharCount(int nChars)5697 void TextOutputDev::incCharCount(int nChars)
5698 {
5699     text->incCharCount(nChars);
5700 }
5701 
beginActualText(GfxState * state,const GooString * t)5702 void TextOutputDev::beginActualText(GfxState *state, const GooString *t)
5703 {
5704     actualText->begin(state, t);
5705 }
5706 
endActualText(GfxState * state)5707 void TextOutputDev::endActualText(GfxState *state)
5708 {
5709     actualText->end(state);
5710 }
5711 
stroke(GfxState * state)5712 void TextOutputDev::stroke(GfxState *state)
5713 {
5714     double x[2], y[2];
5715 
5716     if (!doHTML) {
5717         return;
5718     }
5719     const GfxPath *path = state->getPath();
5720     if (path->getNumSubpaths() != 1) {
5721         return;
5722     }
5723     const GfxSubpath *subpath = path->getSubpath(0);
5724     if (subpath->getNumPoints() != 2) {
5725         return;
5726     }
5727     state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
5728     state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
5729 
5730     // look for a vertical or horizontal line
5731     if (x[0] == x[1] || y[0] == y[1]) {
5732         text->addUnderline(x[0], y[0], x[1], y[1]);
5733     }
5734 }
5735 
fill(GfxState * state)5736 void TextOutputDev::fill(GfxState *state)
5737 {
5738     double x[5], y[5];
5739     double rx0, ry0, rx1, ry1, t;
5740     int i;
5741 
5742     if (!doHTML) {
5743         return;
5744     }
5745     const GfxPath *path = state->getPath();
5746     if (path->getNumSubpaths() != 1) {
5747         return;
5748     }
5749     const GfxSubpath *subpath = path->getSubpath(0);
5750     if (subpath->getNumPoints() != 5) {
5751         return;
5752     }
5753     for (i = 0; i < 5; ++i) {
5754         if (subpath->getCurve(i)) {
5755             return;
5756         }
5757         state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
5758     }
5759 
5760     // look for a rectangle
5761     if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] && x[0] == x[4] && y[0] == y[4]) {
5762         rx0 = x[0];
5763         ry0 = y[0];
5764         rx1 = x[2];
5765         ry1 = y[1];
5766     } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] && x[0] == x[4] && y[0] == y[4]) {
5767         rx0 = x[0];
5768         ry0 = y[0];
5769         rx1 = x[1];
5770         ry1 = y[2];
5771     } else {
5772         return;
5773     }
5774     if (rx1 < rx0) {
5775         t = rx0;
5776         rx0 = rx1;
5777         rx1 = t;
5778     }
5779     if (ry1 < ry0) {
5780         t = ry0;
5781         ry0 = ry1;
5782         ry1 = t;
5783     }
5784 
5785     // skinny horizontal rectangle
5786     if (ry1 - ry0 < rx1 - rx0) {
5787         if (ry1 - ry0 < maxUnderlineWidth) {
5788             ry0 = 0.5 * (ry0 + ry1);
5789             text->addUnderline(rx0, ry0, rx1, ry0);
5790         }
5791 
5792         // skinny vertical rectangle
5793     } else {
5794         if (rx1 - rx0 < maxUnderlineWidth) {
5795             rx0 = 0.5 * (rx0 + rx1);
5796             text->addUnderline(rx0, ry0, rx0, ry1);
5797         }
5798     }
5799 }
5800 
eoFill(GfxState * state)5801 void TextOutputDev::eoFill(GfxState *state)
5802 {
5803     if (!doHTML) {
5804         return;
5805     }
5806     fill(state);
5807 }
5808 
processLink(AnnotLink * link)5809 void TextOutputDev::processLink(AnnotLink *link)
5810 {
5811     double x1, y1, x2, y2;
5812     int xMin, yMin, xMax, yMax, x, y;
5813 
5814     if (!doHTML) {
5815         return;
5816     }
5817     link->getRect(&x1, &y1, &x2, &y2);
5818     cvtUserToDev(x1, y1, &x, &y);
5819     xMin = xMax = x;
5820     yMin = yMax = y;
5821     cvtUserToDev(x1, y2, &x, &y);
5822     if (x < xMin) {
5823         xMin = x;
5824     } else if (x > xMax) {
5825         xMax = x;
5826     }
5827     if (y < yMin) {
5828         yMin = y;
5829     } else if (y > yMax) {
5830         yMax = y;
5831     }
5832     cvtUserToDev(x2, y1, &x, &y);
5833     if (x < xMin) {
5834         xMin = x;
5835     } else if (x > xMax) {
5836         xMax = x;
5837     }
5838     if (y < yMin) {
5839         yMin = y;
5840     } else if (y > yMax) {
5841         yMax = y;
5842     }
5843     cvtUserToDev(x2, y2, &x, &y);
5844     if (x < xMin) {
5845         xMin = x;
5846     } else if (x > xMax) {
5847         xMax = x;
5848     }
5849     if (y < yMin) {
5850         yMin = y;
5851     } else if (y > yMax) {
5852         yMax = y;
5853     }
5854     text->addLink(xMin, yMin, xMax, yMax, link);
5855 }
5856 
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax) const5857 bool TextOutputDev::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const
5858 {
5859     return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax);
5860 }
5861 
getText(double xMin,double yMin,double xMax,double yMax) const5862 GooString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) const
5863 {
5864     return text->getText(xMin, yMin, xMax, yMax, textEOL);
5865 }
5866 
drawSelection(OutputDev * out,double scale,int rotation,const PDFRectangle * selection,SelectionStyle style,const GfxColor * glyph_color,const GfxColor * box_color)5867 void TextOutputDev::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5868 {
5869     text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5870 }
5871 
getSelectionRegion(const PDFRectangle * selection,SelectionStyle style,double scale)5872 std::vector<PDFRectangle *> *TextOutputDev::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5873 {
5874     return text->getSelectionRegion(selection, style, scale);
5875 }
5876 
getSelectionText(const PDFRectangle * selection,SelectionStyle style)5877 GooString *TextOutputDev::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5878 {
5879     return text->getSelectionText(selection, style);
5880 }
5881 
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax) const5882 bool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5883 {
5884     return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5885 }
5886 
setMergeCombining(bool merge)5887 void TextOutputDev::setMergeCombining(bool merge)
5888 {
5889     text->setMergeCombining(merge);
5890 }
5891 
5892 #ifdef TEXTOUT_WORD_LIST
makeWordList()5893 std::unique_ptr<TextWordList> TextOutputDev::makeWordList()
5894 {
5895     return text->makeWordList(physLayout);
5896 }
5897 #endif
5898 
takeText()5899 TextPage *TextOutputDev::takeText()
5900 {
5901     TextPage *ret;
5902 
5903     ret = text;
5904     text = new TextPage(rawOrder, discardDiag);
5905     return ret;
5906 }
5907 
getFlows() const5908 const TextFlow *TextOutputDev::getFlows() const
5909 {
5910     return text->getFlows();
5911 }
5912