1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
18 // Copyright (C) 2006-2008, 2011-2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2006, 2007, 2013 Ed Catmur <ed@catmur.co.uk>
20 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
21 // Copyright (C) 2007, 2008, 2012, 2017 Adrian Johnson <ajohnson@redneon.com>
22 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
23 // Copyright (C) 2008, 2010-2012, 2014-2021 Albert Astals Cid <aacid@kde.org>
24 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
25 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
26 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
27 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
28 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
29 // Copyright (C) 2010, 2021 Marek Kasik <mkasik@redhat.com>
30 // Copyright (C) 2010, 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
31 // Copyright (C) 2011 Sam Liao <phyomh@gmail.com>
32 // Copyright (C) 2012 Horst Prote <prote@fmi.uni-stuttgart.de>
33 // Copyright (C) 2012, 2013-2018 Jason Crain <jason@aquaticape.us>
34 // Copyright (C) 2012 Peter Breitenlohner <peb@mppmu.mpg.de>
35 // Copyright (C) 2013 José Aliste <jaliste@src.gnome.org>
36 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
37 // Copyright (C) 2013 Ed Catmur <ed@catmur.co.uk>
38 // Copyright (C) 2016 Khaled Hosny <khaledhosny@eglug.org>
39 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
40 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
41 // Copyright (C) 2018 Adam Reichold <adam.reichold@t-online.de>
42 // Copyright (C) 2018-2021 Nelson Benítez León <nbenitezl@gmail.com>
43 // Copyright (C) 2019 Christian Persch <chpe@src.gnome.org>
44 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
45 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
46 // Copyright (C) 2021 Peter Williams <peter@newton.cx>
47 //
48 // To see a description of the changes please see the Changelog file that
49 // came with your tarball or type make ChangeLog if you are building from git
50 //
51 //========================================================================
52
53 #include <config.h>
54
55 #include <cstdio>
56 #include <cstdlib>
57 #include <cstddef>
58 #include <cmath>
59 #include <cfloat>
60 #include <cctype>
61 #include <algorithm>
62 #if defined(_WIN32) || defined(__CYGWIN__)
63 # include <fcntl.h> // for O_BINARY
64 # include <io.h> // for _setmode
65 #endif
66 #include "goo/gfile.h"
67 #include "goo/gmem.h"
68 #include "goo/GooString.h"
69 #include "poppler-config.h"
70 #include "Error.h"
71 #include "GlobalParams.h"
72 #include "UnicodeMap.h"
73 #include "UnicodeTypeTable.h"
74 #include "Link.h"
75 #include "TextOutputDev.h"
76 #include "Page.h"
77 #include "Annot.h"
78 #include "UTF.h"
79
80 //------------------------------------------------------------------------
81 // parameters
82 //------------------------------------------------------------------------
83
84 // Each bucket in a text pool includes baselines within a range of
85 // this many points.
86 #define textPoolStep 4
87
88 // Inter-character space width which will cause addChar to start a new
89 // word.
90 #define minWordBreakSpace 0.1
91
92 // Negative inter-character space width, i.e., overlap, which will
93 // cause addChar to start a new word.
94 #define minDupBreakOverlap 0.2
95
96 // Max distance between baselines of two lines within a block, as a
97 // fraction of the font size.
98 #define maxLineSpacingDelta 1.5
99
100 // Max difference in primary font sizes on two lines in the same
101 // block. Delta1 is used when examining new lines above and below the
102 // current block; delta2 is used when examining text that overlaps the
103 // current block; delta3 is used when examining text to the left and
104 // right of the current block.
105 #define maxBlockFontSizeDelta1 0.05
106 #define maxBlockFontSizeDelta2 0.6
107 #define maxBlockFontSizeDelta3 0.2
108
109 // Max difference in font sizes inside a word.
110 #define maxWordFontSizeDelta 0.05
111
112 // Maximum distance between baselines of two words on the same line,
113 // e.g., distance between subscript or superscript and the primary
114 // baseline, as a fraction of the font size.
115 #define maxIntraLineDelta 0.5
116
117 // Minimum inter-word spacing, as a fraction of the font size. (Only
118 // used for raw ordering.)
119 #define minWordSpacing 0.15
120
121 // Maximum inter-word spacing, as a fraction of the font size.
122 #define maxWordSpacing 1.5
123
124 // Maximum horizontal spacing which will allow a word to be pulled
125 // into a block.
126 #define minColSpacing1 0.3
127
128 // Minimum spacing between columns, as a fraction of the font size.
129 #define minColSpacing2 1.0
130
131 // Maximum vertical spacing between blocks within a flow, as a
132 // multiple of the font size.
133 #define maxBlockSpacing 2.5
134
135 // Minimum spacing between characters within a word, as a fraction of
136 // the font size.
137 #define minCharSpacing -0.5
138
139 // Maximum spacing between characters within a word, as a fraction of
140 // the font size, when there is no obvious extra-wide character
141 // spacing.
142 #define maxCharSpacing 0.03
143
144 // When extra-wide character spacing is detected, the inter-character
145 // space threshold is set to the minimum inter-character space
146 // multiplied by this constant.
147 #define maxWideCharSpacingMul 1.3
148
149 // Upper limit on spacing between characters in a word.
150 #define maxWideCharSpacing 0.4
151
152 // Max difference in primary,secondary coordinates (as a fraction of
153 // the font size) allowed for duplicated text (fake boldface, drop
154 // shadows) which is to be discarded.
155 #define dupMaxPriDelta 0.1
156 #define dupMaxSecDelta 0.2
157
158 // Max width of underlines (in points).
159 #define maxUnderlineWidth 3
160
161 // Min distance between baseline and underline (in points).
162 //~ this should be font-size-dependent
163 #define minUnderlineGap -2
164
165 // Max distance between baseline and underline (in points).
166 //~ this should be font-size-dependent
167 #define maxUnderlineGap 4
168
169 // Max horizontal distance between edge of word and start of underline
170 // (in points).
171 //~ this should be font-size-dependent
172 #define underlineSlack 1
173
174 // Max distance between edge of text and edge of link border
175 #define hyperlinkSlack 2
176
177 // Max distance between characters when combining a base character and
178 // combining character
179 #define combMaxMidDelta 0.3
180 #define combMaxBaseDelta 0.4
181
182 // Text is considered diagonal if abs(tan(angle)) > diagonalThreshold.
183 // (Or 1/tan(angle) for 90/270 degrees.)
184 #define diagonalThreshold 0.1
185
186 // How opaque a selection on a glyphless font should be. Since the font is
187 // glyphless and overlaid over text in image form, this must enable users
188 // to read the underlying image. Issue #157
189 #define glyphlessSelectionOpacity 0.4
190
191 // Returns whether x is between a and b or equal to a or b.
192 // a and b don't need to be sorted.
193 #define XBetweenAB(x, a, b) (!(((x) > (a) && (x) > (b)) || ((x) < (a) && (x) < (b))) ? true : false)
194
195 namespace {
196
isAscii7(Unicode uchar)197 inline bool isAscii7(Unicode uchar)
198 {
199 return uchar < 128;
200 }
201
202 }
203
reorderText(const Unicode * text,int len,const UnicodeMap * uMap,bool primaryLR,GooString * s,Unicode * u)204 static int reorderText(const Unicode *text, int len, const UnicodeMap *uMap, bool primaryLR, GooString *s, Unicode *u)
205 {
206 char lre[8], rle[8], popdf[8], buf[8];
207 int lreLen = 0, rleLen = 0, popdfLen = 0, n;
208 int nCols, i, j, k;
209
210 nCols = 0;
211
212 if (s) {
213 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
214 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
215 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
216 }
217
218 if (primaryLR) {
219 i = 0;
220 while (i < len) {
221 // output a left-to-right section
222 for (j = i; j < len && !unicodeTypeR(text[j]); ++j)
223 ;
224 for (k = i; k < j; ++k) {
225 if (s) {
226 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
227 s->append(buf, n);
228 }
229 if (u)
230 u[nCols] = text[k];
231 ++nCols;
232 }
233 i = j;
234 // output a right-to-left section
235 for (j = i; j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); ++j)
236 ;
237 if (j > i) {
238 if (s)
239 s->append(rle, rleLen);
240 for (k = j - 1; k >= i; --k) {
241 if (s) {
242 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
243 s->append(buf, n);
244 }
245 if (u)
246 u[nCols] = text[k];
247 ++nCols;
248 }
249 if (s)
250 s->append(popdf, popdfLen);
251 i = j;
252 }
253 }
254 } else {
255 // Note: This code treats numeric characters (European and
256 // Arabic/Indic) as left-to-right, which isn't strictly correct
257 // (incurs extra LRE/POPDF pairs), but does produce correct
258 // visual formatting.
259 if (s)
260 s->append(rle, rleLen);
261 i = len - 1;
262 while (i >= 0) {
263 // output a right-to-left section
264 for (j = i; j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j])); --j)
265 ;
266 for (k = i; k > j; --k) {
267 if (s) {
268 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
269 s->append(buf, n);
270 }
271 if (u)
272 u[nCols] = text[k];
273 ++nCols;
274 }
275 i = j;
276 // output a left-to-right section
277 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j)
278 ;
279 if (j < i) {
280 if (s)
281 s->append(lre, lreLen);
282 for (k = j + 1; k <= i; ++k) {
283 if (s) {
284 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
285 s->append(buf, n);
286 }
287 if (u)
288 u[nCols] = text[k];
289 ++nCols;
290 }
291 if (s)
292 s->append(popdf, popdfLen);
293 i = j;
294 }
295 }
296 if (s)
297 s->append(popdf, popdfLen);
298 }
299
300 return nCols;
301 }
302
303 //------------------------------------------------------------------------
304 // TextUnderline
305 //------------------------------------------------------------------------
306
307 class TextUnderline
308 {
309 public:
TextUnderline(double x0A,double y0A,double x1A,double y1A)310 TextUnderline(double x0A, double y0A, double x1A, double y1A)
311 {
312 x0 = x0A;
313 y0 = y0A;
314 x1 = x1A;
315 y1 = y1A;
316 horiz = y0 == y1;
317 }
~TextUnderline()318 ~TextUnderline() { }
319
320 double x0, y0, x1, y1;
321 bool horiz;
322 };
323
324 //------------------------------------------------------------------------
325 // TextLink
326 //------------------------------------------------------------------------
327
328 class TextLink
329 {
330 public:
TextLink(int xMinA,int yMinA,int xMaxA,int yMaxA,AnnotLink * linkA)331 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, AnnotLink *linkA)
332 {
333 xMin = xMinA;
334 yMin = yMinA;
335 xMax = xMaxA;
336 yMax = yMaxA;
337 link = linkA;
338 }
~TextLink()339 ~TextLink() { }
340
341 int xMin, yMin, xMax, yMax;
342 AnnotLink *link;
343 };
344
345 //------------------------------------------------------------------------
346 // TextFontInfo
347 //------------------------------------------------------------------------
348
TextFontInfo(const GfxState * state)349 TextFontInfo::TextFontInfo(const GfxState *state)
350 {
351 gfxFont = state->getFont();
352 if (gfxFont)
353 gfxFont->incRefCnt();
354 #ifdef TEXTOUT_WORD_LIST
355 fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy() : nullptr;
356 flags = gfxFont ? gfxFont->getFlags() : 0;
357 #endif
358 }
359
~TextFontInfo()360 TextFontInfo::~TextFontInfo()
361 {
362 if (gfxFont)
363 gfxFont->decRefCnt();
364 #ifdef TEXTOUT_WORD_LIST
365 if (fontName) {
366 delete fontName;
367 }
368 #endif
369 }
370
matches(const GfxState * state) const371 bool TextFontInfo::matches(const GfxState *state) const
372 {
373 return state->getFont() == gfxFont;
374 }
375
matches(const TextFontInfo * fontInfo) const376 bool TextFontInfo::matches(const TextFontInfo *fontInfo) const
377 {
378 return gfxFont == fontInfo->gfxFont;
379 }
380
matches(const Ref * ref) const381 bool TextFontInfo::matches(const Ref *ref) const
382 {
383 return (*(gfxFont->getID()) == *ref);
384 }
385
getAscent() const386 double TextFontInfo::getAscent() const
387 {
388 return gfxFont ? gfxFont->getAscent() : 0.95;
389 }
390
getDescent() const391 double TextFontInfo::getDescent() const
392 {
393 return gfxFont ? gfxFont->getDescent() : -0.35;
394 }
395
getWMode() const396 int TextFontInfo::getWMode() const
397 {
398 return gfxFont ? gfxFont->getWMode() : 0;
399 }
400
401 //------------------------------------------------------------------------
402 // TextWord
403 //------------------------------------------------------------------------
404
TextWord(const GfxState * state,int rotA,double fontSizeA)405 TextWord::TextWord(const GfxState *state, int rotA, double fontSizeA)
406 {
407 rot = rotA;
408 fontSize = fontSizeA;
409 text = nullptr;
410 charcode = nullptr;
411 edge = nullptr;
412 charPos = nullptr;
413 font = nullptr;
414 textMat = nullptr;
415 len = size = 0;
416 spaceAfter = false;
417 next = nullptr;
418 invisible = state->getRender() == 3;
419
420 #ifdef TEXTOUT_WORD_LIST
421 GfxRGB rgb;
422
423 if ((state->getRender() & 3) == 1) {
424 state->getStrokeRGB(&rgb);
425 } else {
426 state->getFillRGB(&rgb);
427 }
428 colorR = colToDbl(rgb.r);
429 colorG = colToDbl(rgb.g);
430 colorB = colToDbl(rgb.b);
431 #endif
432
433 underlined = false;
434 link = nullptr;
435 }
436
~TextWord()437 TextWord::~TextWord()
438 {
439 gfree(text);
440 gfree(charcode);
441 gfree(edge);
442 gfree(charPos);
443 gfree(font);
444 gfree(textMat);
445 }
446
addChar(const GfxState * state,TextFontInfo * fontA,double x,double y,double dx,double dy,int charPosA,int charLen,CharCode c,Unicode u,const Matrix & textMatA)447 void TextWord::addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
448 {
449 ensureCapacity(len + 1);
450 text[len] = u;
451 charcode[len] = c;
452 charPos[len] = charPosA;
453 charPos[len + 1] = charPosA + charLen;
454 font[len] = fontA;
455 textMat[len] = textMatA;
456
457 if (len == 0)
458 setInitialBounds(fontA, x, y);
459
460 if (wMode) { // vertical writing mode
461 // NB: the rotation value has been incremented by 1 (in
462 // TextPage::beginWord()) for vertical writing mode
463 switch (rot) {
464 case 0:
465 edge[len] = x - fontSize;
466 xMax = edge[len + 1] = x;
467 break;
468 case 1:
469 edge[len] = y - fontSize;
470 yMax = edge[len + 1] = y;
471 break;
472 case 2:
473 edge[len] = x + fontSize;
474 xMin = edge[len + 1] = x;
475 break;
476 case 3:
477 edge[len] = y + fontSize;
478 yMin = edge[len + 1] = y;
479 break;
480 }
481 } else { // horizontal writing mode
482 switch (rot) {
483 case 0:
484 edge[len] = x;
485 xMax = edge[len + 1] = x + dx;
486 break;
487 case 1:
488 edge[len] = y;
489 yMax = edge[len + 1] = y + dy;
490 break;
491 case 2:
492 edge[len] = x;
493 xMin = edge[len + 1] = x + dx;
494 break;
495 case 3:
496 edge[len] = y;
497 yMin = edge[len + 1] = y + dy;
498 break;
499 }
500 }
501 ++len;
502 }
503
setInitialBounds(TextFontInfo * fontA,double x,double y)504 void TextWord::setInitialBounds(TextFontInfo *fontA, double x, double y)
505 {
506 double ascent = fontA->getAscent() * fontSize;
507 double descent = fontA->getDescent() * fontSize;
508 wMode = fontA->getWMode();
509
510 if (wMode) { // vertical writing mode
511 // NB: the rotation value has been incremented by 1 (in
512 // TextPage::beginWord()) for vertical writing mode
513 switch (rot) {
514 case 0:
515 xMin = x - fontSize;
516 yMin = y - fontSize;
517 yMax = y;
518 base = y;
519 break;
520 case 1:
521 xMin = x;
522 yMin = y - fontSize;
523 xMax = x + fontSize;
524 base = x;
525 break;
526 case 2:
527 yMin = y;
528 xMax = x + fontSize;
529 yMax = y + fontSize;
530 base = y;
531 break;
532 case 3:
533 xMin = x - fontSize;
534 xMax = x;
535 yMax = y + fontSize;
536 base = x;
537 break;
538 }
539 } else { // horizontal writing mode
540 switch (rot) {
541 case 0:
542 xMin = x;
543 yMin = y - ascent;
544 yMax = y - descent;
545 if (yMin == yMax) {
546 // this is a sanity check for a case that shouldn't happen -- but
547 // if it does happen, we want to avoid dividing by zero later
548 yMin = y;
549 yMax = y + 1;
550 }
551 base = y;
552 break;
553 case 1:
554 xMin = x + descent;
555 yMin = y;
556 xMax = x + ascent;
557 if (xMin == xMax) {
558 // this is a sanity check for a case that shouldn't happen -- but
559 // if it does happen, we want to avoid dividing by zero later
560 xMin = x;
561 xMax = x + 1;
562 }
563 base = x;
564 break;
565 case 2:
566 yMin = y + descent;
567 xMax = x;
568 yMax = y + ascent;
569 if (yMin == yMax) {
570 // this is a sanity check for a case that shouldn't happen -- but
571 // if it does happen, we want to avoid dividing by zero later
572 yMin = y;
573 yMax = y + 1;
574 }
575 base = y;
576 break;
577 case 3:
578 xMin = x - ascent;
579 xMax = x - descent;
580 yMax = y;
581 if (xMin == xMax) {
582 // this is a sanity check for a case that shouldn't happen -- but
583 // if it does happen, we want to avoid dividing by zero later
584 xMin = x;
585 xMax = x + 1;
586 }
587 base = x;
588 break;
589 }
590 }
591 }
592
ensureCapacity(int capacity)593 void TextWord::ensureCapacity(int capacity)
594 {
595 if (capacity > size) {
596 size = std::max(size + 16, capacity);
597 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
598 charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
599 edge = (double *)greallocn(edge, (size + 1), sizeof(double));
600 charPos = (int *)greallocn(charPos, size + 1, sizeof(int));
601 font = (TextFontInfo **)greallocn(font, size, sizeof(TextFontInfo *));
602 textMat = (Matrix *)greallocn(textMat, size, sizeof(Matrix));
603 }
604 }
605
606 struct CombiningTable
607 {
608 Unicode base;
609 Unicode comb;
610 };
611
612 static const struct CombiningTable combiningTable[] = {
613 { 0x0060, 0x0300 }, // grave
614 { 0x00a8, 0x0308 }, // dieresis
615 { 0x00af, 0x0304 }, // macron
616 { 0x00b4, 0x0301 }, // acute
617 { 0x00b8, 0x0327 }, // cedilla
618 { 0x02c6, 0x0302 }, // circumflex
619 { 0x02c7, 0x030c }, // caron
620 { 0x02d8, 0x0306 }, // breve
621 { 0x02d9, 0x0307 }, // dotaccent
622 { 0x02da, 0x030a }, // ring
623 { 0x02dc, 0x0303 }, // tilde
624 { 0x02dd, 0x030b } // hungarumlaut (double acute accent)
625 };
626
627 // returning combining versions of characters
getCombiningChar(Unicode u)628 static Unicode getCombiningChar(Unicode u)
629 {
630 for (const CombiningTable &combining : combiningTable) {
631 if (u == combining.base)
632 return combining.comb;
633 }
634 return 0;
635 }
636
addCombining(const GfxState * state,TextFontInfo * fontA,double fontSizeA,double x,double y,double dx,double dy,int charPosA,int charLen,CharCode c,Unicode u,const Matrix & textMatA)637 bool TextWord::addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA)
638 {
639 if (len == 0 || wMode != 0 || fontA->getWMode() != 0)
640 return false;
641
642 Unicode cCurrent = getCombiningChar(u);
643 Unicode cPrev = getCombiningChar(text[len - 1]);
644 double edgeMid = (edge[len - 1] + edge[len]) / 2;
645 double charMid, maxScaledMidDelta, charBase, maxScaledBaseDelta;
646
647 if (cCurrent != 0 && unicodeTypeAlphaNum(text[len - 1])) {
648 // Current is a combining character, previous is base character
649 maxScaledMidDelta = fabs(edge[len] - edge[len - 1]) * combMaxMidDelta;
650 charMid = charBase = maxScaledBaseDelta = 0;
651
652 // Test if characters overlap
653 if (rot == 0 || rot == 2) {
654 charMid = x + (dx / 2);
655 charBase = y;
656 maxScaledBaseDelta = (yMax - yMin) * combMaxBaseDelta;
657 } else {
658 charMid = y + (dy / 2);
659 charBase = x;
660 maxScaledBaseDelta = (xMax - xMin) * combMaxBaseDelta;
661 }
662
663 if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
664 return false;
665
666 // Add character, but don't adjust edge / bounding box because
667 // combining character's positioning could be odd.
668 ensureCapacity(len + 1);
669 text[len] = cCurrent;
670 charcode[len] = c;
671 charPos[len] = charPosA;
672 charPos[len + 1] = charPosA + charLen;
673 font[len] = fontA;
674 textMat[len] = textMatA;
675 edge[len + 1] = edge[len];
676 edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
677 ++len;
678 return true;
679 }
680
681 if (cPrev != 0 && unicodeTypeAlphaNum(u)) {
682 // Previous is a combining character, current is base character
683 maxScaledBaseDelta = (fontA->getAscent() - fontA->getDescent()) * fontSizeA * combMaxBaseDelta;
684 charMid = charBase = maxScaledMidDelta = 0;
685
686 // Test if characters overlap
687 if (rot == 0 || rot == 2) {
688 charMid = x + (dx / 2);
689 charBase = y;
690 maxScaledMidDelta = fabs(dx * combMaxMidDelta);
691 } else {
692 charMid = y + (dy / 2);
693 charBase = x;
694 maxScaledMidDelta = fabs(dy * combMaxMidDelta);
695 }
696
697 if (fabs(charMid - edgeMid) >= maxScaledMidDelta || fabs(charBase - base) >= maxScaledBaseDelta)
698 return false;
699
700 // move combining character to after base character
701 ensureCapacity(len + 1);
702 fontSize = fontSizeA;
703 text[len] = cPrev;
704 charcode[len] = charcode[len - 1];
705 charPos[len] = charPosA;
706 charPos[len + 1] = charPosA + charLen;
707 font[len] = font[len - 1];
708 textMat[len] = textMat[len - 1];
709
710 text[len - 1] = u;
711 charcode[len - 1] = c;
712 font[len - 1] = fontA;
713 textMat[len - 1] = textMatA;
714
715 if (len == 1)
716 setInitialBounds(fontA, x, y);
717
718 // Updated edges / bounding box because we changed the base
719 // character.
720 if (wMode) {
721 switch (rot) {
722 case 0:
723 edge[len - 1] = x - fontSize;
724 xMax = edge[len + 1] = x;
725 break;
726 case 1:
727 edge[len - 1] = y - fontSize;
728 yMax = edge[len + 1] = y;
729 break;
730 case 2:
731 edge[len - 1] = x + fontSize;
732 xMin = edge[len + 1] = x;
733 break;
734 case 3:
735 edge[len - 1] = y + fontSize;
736 yMin = edge[len + 1] = y;
737 break;
738 }
739 } else {
740 switch (rot) {
741 case 0:
742 edge[len - 1] = x;
743 xMax = edge[len + 1] = x + dx;
744 break;
745 case 1:
746 edge[len - 1] = y;
747 yMax = edge[len + 1] = y + dy;
748 break;
749 case 2:
750 edge[len - 1] = x;
751 xMin = edge[len + 1] = x + dx;
752 break;
753 case 3:
754 edge[len - 1] = y;
755 yMin = edge[len + 1] = y + dy;
756 break;
757 }
758 }
759
760 edge[len] = (edge[len + 1] + edge[len - 1]) / 2;
761 ++len;
762 return true;
763 }
764 return false;
765 }
766
merge(TextWord * word)767 void TextWord::merge(TextWord *word)
768 {
769 int i;
770
771 if (word->xMin < xMin) {
772 xMin = word->xMin;
773 }
774 if (word->yMin < yMin) {
775 yMin = word->yMin;
776 }
777 if (word->xMax > xMax) {
778 xMax = word->xMax;
779 }
780 if (word->yMax > yMax) {
781 yMax = word->yMax;
782 }
783 ensureCapacity(len + word->len);
784 for (i = 0; i < word->len; ++i) {
785 text[len + i] = word->text[i];
786 charcode[len + i] = word->charcode[i];
787 edge[len + i] = word->edge[i];
788 charPos[len + i] = word->charPos[i];
789 font[len + i] = word->font[i];
790 textMat[len + i] = word->textMat[i];
791 }
792 edge[len + word->len] = word->edge[word->len];
793 charPos[len + word->len] = word->charPos[word->len];
794 len += word->len;
795 }
796
primaryCmp(const TextWord * word) const797 inline int TextWord::primaryCmp(const TextWord *word) const
798 {
799 double cmp;
800
801 cmp = 0; // make gcc happy
802 switch (rot) {
803 case 0:
804 cmp = xMin - word->xMin;
805 break;
806 case 1:
807 cmp = yMin - word->yMin;
808 break;
809 case 2:
810 cmp = word->xMax - xMax;
811 break;
812 case 3:
813 cmp = word->yMax - yMax;
814 break;
815 }
816 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
817 }
818
primaryDelta(const TextWord * word) const819 double TextWord::primaryDelta(const TextWord *word) const
820 {
821 double delta;
822
823 delta = 0; // make gcc happy
824 switch (rot) {
825 case 0:
826 delta = word->xMin - xMax;
827 break;
828 case 1:
829 delta = word->yMin - yMax;
830 break;
831 case 2:
832 delta = xMin - word->xMax;
833 break;
834 case 3:
835 delta = yMin - word->yMax;
836 break;
837 }
838 return delta;
839 }
840
cmpYX(const void * p1,const void * p2)841 int TextWord::cmpYX(const void *p1, const void *p2)
842 {
843 TextWord *word1 = *(TextWord **)p1;
844 TextWord *word2 = *(TextWord **)p2;
845 double cmp;
846
847 cmp = word1->yMin - word2->yMin;
848 if (cmp == 0) {
849 cmp = word1->xMin - word2->xMin;
850 }
851 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
852 }
853
854 #ifdef TEXTOUT_WORD_LIST
855
getText() const856 GooString *TextWord::getText() const
857 {
858 GooString *s;
859 const UnicodeMap *uMap;
860 char buf[8];
861 int n, i;
862
863 s = new GooString();
864 if (!(uMap = globalParams->getTextEncoding())) {
865 return s;
866 }
867 for (i = 0; i < len; ++i) {
868 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
869 s->append(buf, n);
870 }
871 return s;
872 }
873
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA) const874 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
875 {
876 if (charIdx < 0 || charIdx >= len) {
877 return;
878 }
879 switch (rot) {
880 case 0:
881 *xMinA = edge[charIdx];
882 *xMaxA = edge[charIdx + 1];
883 *yMinA = yMin;
884 *yMaxA = yMax;
885 break;
886 case 1:
887 *xMinA = xMin;
888 *xMaxA = xMax;
889 *yMinA = edge[charIdx];
890 *yMaxA = edge[charIdx + 1];
891 break;
892 case 2:
893 *xMinA = edge[charIdx + 1];
894 *xMaxA = edge[charIdx];
895 *yMinA = yMin;
896 *yMaxA = yMax;
897 break;
898 case 3:
899 *xMinA = xMin;
900 *xMaxA = xMax;
901 *yMinA = edge[charIdx + 1];
902 *yMaxA = edge[charIdx];
903 break;
904 }
905 }
906
907 #endif // TEXTOUT_WORD_LIST
908
909 //------------------------------------------------------------------------
910 // TextPool
911 //------------------------------------------------------------------------
912
TextPool()913 TextPool::TextPool()
914 {
915 minBaseIdx = 0;
916 maxBaseIdx = -1;
917 pool = nullptr;
918 cursor = nullptr;
919 cursorBaseIdx = -1;
920 }
921
~TextPool()922 TextPool::~TextPool()
923 {
924 int baseIdx;
925 TextWord *word, *word2;
926
927 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
928 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
929 word2 = word->next;
930 delete word;
931 }
932 }
933 gfree(pool);
934 }
935
getBaseIdx(double base) const936 int TextPool::getBaseIdx(double base) const
937 {
938 const double baseIdxDouble = base / textPoolStep;
939 if (std::isnan(baseIdxDouble) || baseIdxDouble < minBaseIdx) {
940 return minBaseIdx;
941 }
942 if (baseIdxDouble > maxBaseIdx) {
943 return maxBaseIdx;
944 }
945 return (int)baseIdxDouble;
946 }
947
addWord(TextWord * word)948 void TextPool::addWord(TextWord *word)
949 {
950 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
951 TextWord *w0, *w1;
952
953 // expand the array if needed
954 wordBaseIdx = (int)(word->base / textPoolStep);
955 if (unlikely(wordBaseIdx <= INT_MIN + 128 || wordBaseIdx >= INT_MAX - 128)) {
956 error(errSyntaxWarning, -1, "wordBaseIdx out of range");
957 delete word;
958 return;
959 }
960 if (minBaseIdx > maxBaseIdx) {
961 minBaseIdx = wordBaseIdx - 128;
962 maxBaseIdx = wordBaseIdx + 128;
963 pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1, sizeof(TextWord *));
964 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
965 pool[baseIdx - minBaseIdx] = nullptr;
966 }
967 } else if (wordBaseIdx < minBaseIdx) {
968 newMinBaseIdx = wordBaseIdx - 128;
969 TextWord **newPool = (TextWord **)gmallocn_checkoverflow(maxBaseIdx - newMinBaseIdx + 1, sizeof(TextWord *));
970 if (unlikely(!newPool)) {
971 error(errSyntaxWarning, -1, "newPool would overflow");
972 delete word;
973 return;
974 }
975 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
976 newPool[baseIdx - newMinBaseIdx] = nullptr;
977 }
978 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool, (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
979 gfree(pool);
980 pool = newPool;
981 minBaseIdx = newMinBaseIdx;
982 } else if (wordBaseIdx > maxBaseIdx) {
983 newMaxBaseIdx = wordBaseIdx + 128;
984 TextWord **reallocatedPool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1, sizeof(TextWord *), true /*checkoverflow*/, false /*free_pool*/);
985 if (!reallocatedPool) {
986 error(errSyntaxWarning, -1, "new pool size would overflow");
987 delete word;
988 return;
989 }
990 pool = reallocatedPool;
991 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
992 pool[baseIdx - minBaseIdx] = nullptr;
993 }
994 maxBaseIdx = newMaxBaseIdx;
995 }
996
997 // insert the new word
998 if (cursor && wordBaseIdx == cursorBaseIdx && word->primaryCmp(cursor) >= 0) {
999 w0 = cursor;
1000 w1 = cursor->next;
1001 } else {
1002 w0 = nullptr;
1003 w1 = pool[wordBaseIdx - minBaseIdx];
1004 }
1005 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next)
1006 ;
1007 word->next = w1;
1008 if (w0) {
1009 w0->next = word;
1010 } else {
1011 pool[wordBaseIdx - minBaseIdx] = word;
1012 }
1013 cursor = word;
1014 cursorBaseIdx = wordBaseIdx;
1015 }
1016
1017 //------------------------------------------------------------------------
1018 // TextLine
1019 //------------------------------------------------------------------------
1020
TextLine(TextBlock * blkA,int rotA,double baseA)1021 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA)
1022 {
1023 blk = blkA;
1024 rot = rotA;
1025 base = baseA;
1026 words = lastWord = nullptr;
1027 text = nullptr;
1028 edge = nullptr;
1029 col = nullptr;
1030 len = 0;
1031 convertedLen = 0;
1032 hyphenated = false;
1033 next = nullptr;
1034 xMin = yMin = 0;
1035 xMax = yMax = -1;
1036 normalized = nullptr;
1037 normalized_len = 0;
1038 normalized_idx = nullptr;
1039 ascii_translation = nullptr;
1040 ascii_len = 0;
1041 ascii_idx = nullptr;
1042 }
1043
~TextLine()1044 TextLine::~TextLine()
1045 {
1046 TextWord *word;
1047
1048 while (words) {
1049 word = words;
1050 words = words->next;
1051 delete word;
1052 }
1053 gfree(text);
1054 gfree(edge);
1055 gfree(col);
1056 if (normalized) {
1057 gfree(normalized);
1058 gfree(normalized_idx);
1059 }
1060 if (ascii_translation) {
1061 gfree(ascii_translation);
1062 gfree(ascii_idx);
1063 }
1064 }
1065
addWord(TextWord * word)1066 void TextLine::addWord(TextWord *word)
1067 {
1068 if (lastWord) {
1069 lastWord->next = word;
1070 } else {
1071 words = word;
1072 }
1073 lastWord = word;
1074
1075 if (xMin > xMax) {
1076 xMin = word->xMin;
1077 xMax = word->xMax;
1078 yMin = word->yMin;
1079 yMax = word->yMax;
1080 } else {
1081 if (word->xMin < xMin) {
1082 xMin = word->xMin;
1083 }
1084 if (word->xMax > xMax) {
1085 xMax = word->xMax;
1086 }
1087 if (word->yMin < yMin) {
1088 yMin = word->yMin;
1089 }
1090 if (word->yMax > yMax) {
1091 yMax = word->yMax;
1092 }
1093 }
1094 }
1095
primaryDelta(const TextLine * line) const1096 double TextLine::primaryDelta(const TextLine *line) const
1097 {
1098 double delta;
1099
1100 delta = 0; // make gcc happy
1101 switch (rot) {
1102 case 0:
1103 delta = line->xMin - xMax;
1104 break;
1105 case 1:
1106 delta = line->yMin - yMax;
1107 break;
1108 case 2:
1109 delta = xMin - line->xMax;
1110 break;
1111 case 3:
1112 delta = yMin - line->yMax;
1113 break;
1114 }
1115 return delta;
1116 }
1117
primaryCmp(const TextLine * line) const1118 int TextLine::primaryCmp(const TextLine *line) const
1119 {
1120 double cmp;
1121
1122 cmp = 0; // make gcc happy
1123 switch (rot) {
1124 case 0:
1125 cmp = xMin - line->xMin;
1126 break;
1127 case 1:
1128 cmp = yMin - line->yMin;
1129 break;
1130 case 2:
1131 cmp = line->xMax - xMax;
1132 break;
1133 case 3:
1134 cmp = line->yMax - yMax;
1135 break;
1136 }
1137 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1138 }
1139
secondaryCmp(const TextLine * line) const1140 int TextLine::secondaryCmp(const TextLine *line) const
1141 {
1142 double cmp;
1143
1144 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
1145 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1146 }
1147
cmpYX(const TextLine * line) const1148 int TextLine::cmpYX(const TextLine *line) const
1149 {
1150 int cmp;
1151
1152 if ((cmp = secondaryCmp(line))) {
1153 return cmp;
1154 }
1155 return primaryCmp(line);
1156 }
1157
cmpXY(const void * p1,const void * p2)1158 int TextLine::cmpXY(const void *p1, const void *p2)
1159 {
1160 TextLine *line1 = *(TextLine **)p1;
1161 TextLine *line2 = *(TextLine **)p2;
1162 int cmp;
1163
1164 if ((cmp = line1->primaryCmp(line2))) {
1165 return cmp;
1166 }
1167 return line1->secondaryCmp(line2);
1168 }
1169
coalesce(const UnicodeMap * uMap)1170 void TextLine::coalesce(const UnicodeMap *uMap)
1171 {
1172 TextWord *word0, *word1;
1173 double space, delta, minSpace;
1174 bool isUnicode;
1175 char buf[8];
1176 int i, j;
1177
1178 if (words->next) {
1179
1180 // compute the inter-word space threshold
1181 if (words->len > 1 || words->next->len > 1) {
1182 minSpace = 0;
1183 } else {
1184 minSpace = words->primaryDelta(words->next);
1185 for (word0 = words->next, word1 = word0->next; word1 && minSpace > 0; word0 = word1, word1 = word0->next) {
1186 if (word1->len > 1) {
1187 minSpace = 0;
1188 }
1189 delta = word0->primaryDelta(word1);
1190 if (delta < minSpace) {
1191 minSpace = delta;
1192 }
1193 }
1194 }
1195 if (minSpace <= 0) {
1196 space = maxCharSpacing * words->fontSize;
1197 } else {
1198 space = maxWideCharSpacingMul * minSpace;
1199 if (space > maxWideCharSpacing * words->fontSize) {
1200 space = maxWideCharSpacing * words->fontSize;
1201 }
1202 }
1203
1204 // merge words
1205 word0 = words;
1206 word1 = words->next;
1207 while (word1) {
1208 if (word0->primaryDelta(word1) >= space) {
1209 word0->spaceAfter = true;
1210 word0 = word1;
1211 word1 = word1->next;
1212 } else if (word0->font[word0->len - 1] == word1->font[0] && word0->underlined == word1->underlined && fabs(word0->fontSize - word1->fontSize) < maxWordFontSizeDelta * words->fontSize
1213 && word1->charPos[0] == word0->charPos[word0->len]) {
1214 word0->merge(word1);
1215 word0->next = word1->next;
1216 delete word1;
1217 word1 = word0->next;
1218 } else {
1219 word0 = word1;
1220 word1 = word1->next;
1221 }
1222 }
1223 }
1224
1225 // build the line text
1226 isUnicode = uMap ? uMap->isUnicode() : false;
1227 len = 0;
1228 for (word1 = words; word1; word1 = word1->next) {
1229 len += word1->len;
1230 if (word1->spaceAfter) {
1231 ++len;
1232 }
1233 }
1234 text = (Unicode *)gmallocn(len, sizeof(Unicode));
1235 edge = (double *)gmallocn(len + 1, sizeof(double));
1236 i = 0;
1237 for (word1 = words; word1; word1 = word1->next) {
1238 for (j = 0; j < word1->len; ++j) {
1239 text[i] = word1->text[j];
1240 edge[i] = word1->edge[j];
1241 ++i;
1242 }
1243 edge[i] = word1->edge[word1->len];
1244 if (word1->spaceAfter) {
1245 text[i] = (Unicode)0x0020;
1246 ++i;
1247 }
1248 }
1249
1250 // compute convertedLen and set up the col array
1251 col = (int *)gmallocn(len + 1, sizeof(int));
1252 convertedLen = 0;
1253 for (i = 0; i < len; ++i) {
1254 col[i] = convertedLen;
1255 if (isUnicode) {
1256 ++convertedLen;
1257 } else if (uMap) {
1258 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
1259 }
1260 }
1261 col[len] = convertedLen;
1262
1263 // check for hyphen at end of line
1264 //~ need to check for other chars used as hyphens
1265 hyphenated = text[len - 1] == (Unicode)'-';
1266 }
1267
1268 //------------------------------------------------------------------------
1269 // TextLineFrag
1270 //------------------------------------------------------------------------
1271
1272 class TextLineFrag
1273 {
1274 public:
1275 TextLine *line; // the line object
1276 int start, len; // offset and length of this fragment
1277 // (in Unicode chars)
1278 double xMin, xMax; // bounding box coordinates
1279 double yMin, yMax;
1280 double base; // baseline virtual coordinate
1281 int col; // first column
1282
1283 void init(TextLine *lineA, int startA, int lenA);
1284 void computeCoords(bool oneRot);
1285
1286 static int cmpYXPrimaryRot(const void *p1, const void *p2);
1287 static int cmpYXLineRot(const void *p1, const void *p2);
1288 static int cmpXYLineRot(const void *p1, const void *p2);
1289 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
1290 static int cmpXYColumnLineRot(const void *p1, const void *p2);
1291 };
1292
init(TextLine * lineA,int startA,int lenA)1293 void TextLineFrag::init(TextLine *lineA, int startA, int lenA)
1294 {
1295 line = lineA;
1296 start = startA;
1297 len = lenA;
1298 col = line->col[start];
1299 }
1300
computeCoords(bool oneRot)1301 void TextLineFrag::computeCoords(bool oneRot)
1302 {
1303 TextBlock *blk;
1304 double d0, d1, d2, d3, d4;
1305
1306 if (oneRot) {
1307
1308 switch (line->rot) {
1309 case 0:
1310 xMin = line->edge[start];
1311 xMax = line->edge[start + len];
1312 yMin = line->yMin;
1313 yMax = line->yMax;
1314 break;
1315 case 1:
1316 xMin = line->xMin;
1317 xMax = line->xMax;
1318 yMin = line->edge[start];
1319 yMax = line->edge[start + len];
1320 break;
1321 case 2:
1322 xMin = line->edge[start + len];
1323 xMax = line->edge[start];
1324 yMin = line->yMin;
1325 yMax = line->yMax;
1326 break;
1327 case 3:
1328 xMin = line->xMin;
1329 xMax = line->xMax;
1330 yMin = line->edge[start + len];
1331 yMax = line->edge[start];
1332 break;
1333 }
1334 base = line->base;
1335
1336 } else {
1337
1338 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
1339
1340 xMin = line->edge[start];
1341 xMax = line->edge[start + len];
1342 yMin = line->yMin;
1343 yMax = line->yMax;
1344 base = line->base;
1345
1346 } else {
1347
1348 blk = line->blk;
1349 d0 = line->edge[start];
1350 d1 = line->edge[start + len];
1351 d2 = d3 = d4 = 0; // make gcc happy
1352
1353 switch (line->rot) {
1354 case 0:
1355 d2 = line->yMin;
1356 d3 = line->yMax;
1357 d4 = line->base;
1358 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
1359 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
1360 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
1361 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
1362 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
1363 break;
1364 case 1:
1365 d2 = line->xMax;
1366 d3 = line->xMin;
1367 d4 = line->base;
1368 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
1369 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
1370 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
1371 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
1372 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
1373 break;
1374 case 2:
1375 d2 = line->yMax;
1376 d3 = line->yMin;
1377 d4 = line->base;
1378 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
1379 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
1380 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
1381 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
1382 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
1383 break;
1384 case 3:
1385 d2 = line->xMin;
1386 d3 = line->xMax;
1387 d4 = line->base;
1388 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
1389 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
1390 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
1391 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
1392 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
1393 break;
1394 }
1395
1396 switch (line->blk->page->primaryRot) {
1397 case 0:
1398 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
1399 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
1400 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
1401 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
1402 base = blk->yMin + d4 * (blk->yMax - blk->yMin);
1403 break;
1404 case 1:
1405 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
1406 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
1407 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
1408 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
1409 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
1410 break;
1411 case 2:
1412 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
1413 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
1414 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
1415 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
1416 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
1417 break;
1418 case 3:
1419 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
1420 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
1421 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
1422 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
1423 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
1424 break;
1425 }
1426 }
1427 }
1428 }
1429
cmpYXPrimaryRot(const void * p1,const void * p2)1430 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2)
1431 {
1432 TextLineFrag *frag1 = (TextLineFrag *)p1;
1433 TextLineFrag *frag2 = (TextLineFrag *)p2;
1434 double cmp;
1435
1436 cmp = 0; // make gcc happy
1437 switch (frag1->line->blk->page->primaryRot) {
1438 case 0:
1439 if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1440 cmp = frag1->xMin - frag2->xMin;
1441 }
1442 break;
1443 case 1:
1444 if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1445 cmp = frag1->yMin - frag2->yMin;
1446 }
1447 break;
1448 case 2:
1449 if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1450 cmp = frag2->xMax - frag1->xMax;
1451 }
1452 break;
1453 case 3:
1454 if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1455 cmp = frag2->yMax - frag1->yMax;
1456 }
1457 break;
1458 }
1459 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1460 }
1461
cmpYXLineRot(const void * p1,const void * p2)1462 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2)
1463 {
1464 TextLineFrag *frag1 = (TextLineFrag *)p1;
1465 TextLineFrag *frag2 = (TextLineFrag *)p2;
1466 double cmp;
1467
1468 cmp = 0; // make gcc happy
1469 switch (frag1->line->rot) {
1470 case 0:
1471 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1472 cmp = frag1->xMin - frag2->xMin;
1473 }
1474 break;
1475 case 1:
1476 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1477 cmp = frag1->yMin - frag2->yMin;
1478 }
1479 break;
1480 case 2:
1481 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1482 cmp = frag2->xMax - frag1->xMax;
1483 }
1484 break;
1485 case 3:
1486 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1487 cmp = frag2->yMax - frag1->yMax;
1488 }
1489 break;
1490 }
1491 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1492 }
1493
cmpXYLineRot(const void * p1,const void * p2)1494 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2)
1495 {
1496 TextLineFrag *frag1 = (TextLineFrag *)p1;
1497 TextLineFrag *frag2 = (TextLineFrag *)p2;
1498 double cmp;
1499
1500 cmp = 0; // make gcc happy
1501 switch (frag1->line->rot) {
1502 case 0:
1503 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1504 cmp = frag1->yMin - frag2->yMin;
1505 }
1506 break;
1507 case 1:
1508 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1509 cmp = frag2->xMax - frag1->xMax;
1510 }
1511 break;
1512 case 2:
1513 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1514 cmp = frag2->yMin - frag1->yMin;
1515 }
1516 break;
1517 case 3:
1518 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1519 cmp = frag1->xMax - frag2->xMax;
1520 }
1521 break;
1522 }
1523 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1524 }
1525
cmpXYColumnPrimaryRot(const void * p1,const void * p2)1526 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2)
1527 {
1528 TextLineFrag *frag1 = (TextLineFrag *)p1;
1529 TextLineFrag *frag2 = (TextLineFrag *)p2;
1530 double cmp;
1531
1532 // if columns overlap, compare y values
1533 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1534 cmp = 0; // make gcc happy
1535 switch (frag1->line->blk->page->primaryRot) {
1536 case 0:
1537 cmp = frag1->yMin - frag2->yMin;
1538 break;
1539 case 1:
1540 cmp = frag2->xMax - frag1->xMax;
1541 break;
1542 case 2:
1543 cmp = frag2->yMin - frag1->yMin;
1544 break;
1545 case 3:
1546 cmp = frag1->xMax - frag2->xMax;
1547 break;
1548 }
1549 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1550 }
1551
1552 // otherwise, compare starting column
1553 return frag1->col - frag2->col;
1554 }
1555
cmpXYColumnLineRot(const void * p1,const void * p2)1556 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2)
1557 {
1558 TextLineFrag *frag1 = (TextLineFrag *)p1;
1559 TextLineFrag *frag2 = (TextLineFrag *)p2;
1560 double cmp;
1561
1562 // if columns overlap, compare y values
1563 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] - frag2->line->col[frag2->start]) && frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start])) {
1564 cmp = 0; // make gcc happy
1565 switch (frag1->line->rot) {
1566 case 0:
1567 cmp = frag1->yMin - frag2->yMin;
1568 break;
1569 case 1:
1570 cmp = frag2->xMax - frag1->xMax;
1571 break;
1572 case 2:
1573 cmp = frag2->yMin - frag1->yMin;
1574 break;
1575 case 3:
1576 cmp = frag1->xMax - frag2->xMax;
1577 break;
1578 }
1579 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1580 }
1581
1582 // otherwise, compare starting column
1583 return frag1->col - frag2->col;
1584 }
1585
1586 //------------------------------------------------------------------------
1587 // TextBlock
1588 //------------------------------------------------------------------------
1589
TextBlock(TextPage * pageA,int rotA)1590 TextBlock::TextBlock(TextPage *pageA, int rotA)
1591 {
1592 page = pageA;
1593 rot = rotA;
1594 xMin = yMin = 0;
1595 xMax = yMax = -1;
1596 priMin = 0;
1597 priMax = page->pageWidth;
1598 pool = new TextPool();
1599 lines = nullptr;
1600 curLine = nullptr;
1601 next = nullptr;
1602 stackNext = nullptr;
1603 tableId = -1;
1604 tableEnd = false;
1605 }
1606
~TextBlock()1607 TextBlock::~TextBlock()
1608 {
1609 TextLine *line;
1610
1611 delete pool;
1612 while (lines) {
1613 line = lines;
1614 lines = lines->next;
1615 delete line;
1616 }
1617 }
1618
addWord(TextWord * word)1619 void TextBlock::addWord(TextWord *word)
1620 {
1621 pool->addWord(word);
1622 if (xMin > xMax) {
1623 xMin = word->xMin;
1624 xMax = word->xMax;
1625 yMin = word->yMin;
1626 yMax = word->yMax;
1627 } else {
1628 if (word->xMin < xMin) {
1629 xMin = word->xMin;
1630 }
1631 if (word->xMax > xMax) {
1632 xMax = word->xMax;
1633 }
1634 if (word->yMin < yMin) {
1635 yMin = word->yMin;
1636 }
1637 if (word->yMax > yMax) {
1638 yMax = word->yMax;
1639 }
1640 }
1641 }
1642
coalesce(const UnicodeMap * uMap,double fixedPitch)1643 void TextBlock::coalesce(const UnicodeMap *uMap, double fixedPitch)
1644 {
1645 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1646 TextLine *line, *line0, *line1;
1647 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1648 int baseIdx, bestWordBaseIdx, idx0, idx1;
1649 double minBase, maxBase;
1650 double fontSize, wordSpacing, delta, priDelta, secDelta;
1651 TextLine **lineArray;
1652 bool found, overlap;
1653 int col1, col2;
1654 int i, j, k;
1655
1656 // discard duplicated text (fake boldface, drop shadows)
1657 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1658 word0 = pool->getPool(idx0);
1659 while (word0) {
1660 priDelta = dupMaxPriDelta * word0->fontSize;
1661 secDelta = dupMaxSecDelta * word0->fontSize;
1662 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1663 found = false;
1664 word1 = word2 = nullptr; // make gcc happy
1665 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1666 if (idx1 == idx0) {
1667 word1 = word0;
1668 word2 = word0->next;
1669 } else {
1670 word1 = nullptr;
1671 word2 = pool->getPool(idx1);
1672 }
1673 for (; word2; word1 = word2, word2 = word2->next) {
1674 if (word2->len == word0->len && !memcmp(word2->text, word0->text, word0->len * sizeof(Unicode))) {
1675 switch (rot) {
1676 case 0:
1677 case 2:
1678 found = fabs(word0->xMin - word2->xMin) < priDelta && fabs(word0->xMax - word2->xMax) < priDelta && fabs(word0->yMin - word2->yMin) < secDelta && fabs(word0->yMax - word2->yMax) < secDelta;
1679 break;
1680 case 1:
1681 case 3:
1682 found = fabs(word0->xMin - word2->xMin) < secDelta && fabs(word0->xMax - word2->xMax) < secDelta && fabs(word0->yMin - word2->yMin) < priDelta && fabs(word0->yMax - word2->yMax) < priDelta;
1683 break;
1684 }
1685 }
1686 if (found) {
1687 break;
1688 }
1689 }
1690 if (found) {
1691 break;
1692 }
1693 }
1694 if (found) {
1695 if (word1) {
1696 word1->next = word2->next;
1697 } else {
1698 pool->setPool(idx1, word2->next);
1699 }
1700 delete word2;
1701 } else {
1702 word0 = word0->next;
1703 }
1704 }
1705 }
1706
1707 // build the lines
1708 curLine = nullptr;
1709 poolMinBaseIdx = pool->minBaseIdx;
1710 charCount = 0;
1711 nLines = 0;
1712 while (true) {
1713
1714 // find the first non-empty line in the pool
1715 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
1716 ;
1717 if (poolMinBaseIdx > pool->maxBaseIdx) {
1718 break;
1719 }
1720
1721 // look for the left-most word in the first four lines of the
1722 // pool -- this avoids starting with a superscript word
1723 startBaseIdx = poolMinBaseIdx;
1724 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1725 if (!pool->getPool(baseIdx)) {
1726 continue;
1727 }
1728 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
1729 startBaseIdx = baseIdx;
1730 }
1731 }
1732
1733 // create a new line
1734 word0 = pool->getPool(startBaseIdx);
1735 pool->setPool(startBaseIdx, word0->next);
1736 word0->next = nullptr;
1737 line = new TextLine(this, word0->rot, word0->base);
1738 line->addWord(word0);
1739 lastWord = word0;
1740
1741 // compute the search range
1742 fontSize = word0->fontSize;
1743 minBase = word0->base - maxIntraLineDelta * fontSize;
1744 maxBase = word0->base + maxIntraLineDelta * fontSize;
1745 minBaseIdx = pool->getBaseIdx(minBase);
1746 maxBaseIdx = pool->getBaseIdx(maxBase);
1747 wordSpacing = fixedPitch ? fixedPitch : maxWordSpacing * fontSize;
1748
1749 // find the rest of the words in this line
1750 while (true) {
1751
1752 // find the left-most word whose baseline is in the range for
1753 // this line
1754 bestWordBaseIdx = 0;
1755 bestWord0 = bestWord1 = nullptr;
1756 overlap = false;
1757 for (baseIdx = minBaseIdx; !overlap && baseIdx <= maxBaseIdx; ++baseIdx) {
1758 for (word0 = nullptr, word1 = pool->getPool(baseIdx); word1; word0 = word1, word1 = word1->next) {
1759 if (word1->base >= minBase && word1->base <= maxBase) {
1760 delta = lastWord->primaryDelta(word1);
1761 if (delta < minCharSpacing * fontSize) {
1762 overlap = true;
1763 break;
1764 } else {
1765 if (delta < wordSpacing && (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1766 bestWordBaseIdx = baseIdx;
1767 bestWord0 = word0;
1768 bestWord1 = word1;
1769 }
1770 break;
1771 }
1772 }
1773 }
1774 }
1775 if (overlap || !bestWord1) {
1776 break;
1777 }
1778
1779 // remove it from the pool, and add it to the line
1780 if (bestWord0) {
1781 bestWord0->next = bestWord1->next;
1782 } else {
1783 pool->setPool(bestWordBaseIdx, bestWord1->next);
1784 }
1785 bestWord1->next = nullptr;
1786 line->addWord(bestWord1);
1787 lastWord = bestWord1;
1788 }
1789
1790 // add the line
1791 if (curLine && line->cmpYX(curLine) > 0) {
1792 line0 = curLine;
1793 line1 = curLine->next;
1794 } else {
1795 line0 = nullptr;
1796 line1 = lines;
1797 }
1798 for (; line1 && line->cmpYX(line1) > 0; line0 = line1, line1 = line1->next)
1799 ;
1800 if (line0) {
1801 line0->next = line;
1802 } else {
1803 lines = line;
1804 }
1805 line->next = line1;
1806 curLine = line;
1807 line->coalesce(uMap);
1808 charCount += line->len;
1809 ++nLines;
1810 }
1811
1812 // sort lines into xy order for column assignment
1813 lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1814 for (line = lines, i = 0; line; line = line->next, ++i) {
1815 lineArray[i] = line;
1816 }
1817 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1818
1819 // column assignment
1820 nColumns = 0;
1821 if (fixedPitch) {
1822 for (i = 0; i < nLines; ++i) {
1823 line0 = lineArray[i];
1824 col1 = 0; // make gcc happy
1825 switch (rot) {
1826 case 0:
1827 col1 = (int)((line0->xMin - xMin) / fixedPitch + 0.5);
1828 break;
1829 case 1:
1830 col1 = (int)((line0->yMin - yMin) / fixedPitch + 0.5);
1831 break;
1832 case 2:
1833 col1 = (int)((xMax - line0->xMax) / fixedPitch + 0.5);
1834 break;
1835 case 3:
1836 col1 = (int)((yMax - line0->yMax) / fixedPitch + 0.5);
1837 break;
1838 }
1839 for (k = 0; k <= line0->len; ++k) {
1840 line0->col[k] += col1;
1841 }
1842 if (line0->col[line0->len] > nColumns) {
1843 nColumns = line0->col[line0->len];
1844 }
1845 }
1846 } else {
1847 for (i = 0; i < nLines; ++i) {
1848 line0 = lineArray[i];
1849 col1 = 0;
1850 for (j = 0; j < i; ++j) {
1851 line1 = lineArray[j];
1852 if (line1->primaryDelta(line0) >= 0) {
1853 col2 = line1->col[line1->len] + 1;
1854 } else {
1855 k = 0; // make gcc happy
1856 switch (rot) {
1857 case 0:
1858 for (k = 0; k < line1->len && line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1859 ;
1860 break;
1861 case 1:
1862 for (k = 0; k < line1->len && line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1863 ;
1864 break;
1865 case 2:
1866 for (k = 0; k < line1->len && line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1867 ;
1868 break;
1869 case 3:
1870 for (k = 0; k < line1->len && line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k + 1]); ++k)
1871 ;
1872 break;
1873 }
1874 col2 = line1->col[k];
1875 }
1876 if (col2 > col1) {
1877 col1 = col2;
1878 }
1879 }
1880 for (k = 0; k <= line0->len; ++k) {
1881 line0->col[k] += col1;
1882 }
1883 if (line0->col[line0->len] > nColumns) {
1884 nColumns = line0->col[line0->len];
1885 }
1886 }
1887 }
1888 gfree(lineArray);
1889 }
1890
updatePriMinMax(const TextBlock * blk)1891 void TextBlock::updatePriMinMax(const TextBlock *blk)
1892 {
1893 double newPriMin, newPriMax;
1894 bool gotPriMin, gotPriMax;
1895
1896 gotPriMin = gotPriMax = false;
1897 newPriMin = newPriMax = 0; // make gcc happy
1898 switch (page->primaryRot) {
1899 case 0:
1900 case 2:
1901 if (blk->yMin < yMax && blk->yMax > yMin) {
1902 if (blk->xMin < xMin) {
1903 newPriMin = blk->xMax;
1904 gotPriMin = true;
1905 }
1906 if (blk->xMax > xMax) {
1907 newPriMax = blk->xMin;
1908 gotPriMax = true;
1909 }
1910 }
1911 break;
1912 case 1:
1913 case 3:
1914 if (blk->xMin < xMax && blk->xMax > xMin) {
1915 if (blk->yMin < yMin) {
1916 newPriMin = blk->yMax;
1917 gotPriMin = true;
1918 }
1919 if (blk->yMax > yMax) {
1920 newPriMax = blk->yMin;
1921 gotPriMax = true;
1922 }
1923 }
1924 break;
1925 }
1926 if (gotPriMin) {
1927 if (newPriMin > xMin) {
1928 newPriMin = xMin;
1929 }
1930 if (newPriMin > priMin) {
1931 priMin = newPriMin;
1932 }
1933 }
1934 if (gotPriMax) {
1935 if (newPriMax < xMax) {
1936 newPriMax = xMax;
1937 }
1938 if (newPriMax < priMax) {
1939 priMax = newPriMax;
1940 }
1941 }
1942 }
1943
cmpXYPrimaryRot(const void * p1,const void * p2)1944 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2)
1945 {
1946 TextBlock *blk1 = *(TextBlock **)p1;
1947 TextBlock *blk2 = *(TextBlock **)p2;
1948 double cmp;
1949
1950 cmp = 0; // make gcc happy
1951 switch (blk1->page->primaryRot) {
1952 case 0:
1953 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1954 cmp = blk1->yMin - blk2->yMin;
1955 }
1956 break;
1957 case 1:
1958 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1959 cmp = blk2->xMax - blk1->xMax;
1960 }
1961 break;
1962 case 2:
1963 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1964 cmp = blk2->yMin - blk1->yMin;
1965 }
1966 break;
1967 case 3:
1968 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1969 cmp = blk1->xMax - blk2->xMax;
1970 }
1971 break;
1972 }
1973 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1974 }
1975
cmpYXPrimaryRot(const void * p1,const void * p2)1976 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2)
1977 {
1978 TextBlock *blk1 = *(TextBlock **)p1;
1979 TextBlock *blk2 = *(TextBlock **)p2;
1980 double cmp;
1981
1982 cmp = 0; // make gcc happy
1983 switch (blk1->page->primaryRot) {
1984 case 0:
1985 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1986 cmp = blk1->xMin - blk2->xMin;
1987 }
1988 break;
1989 case 1:
1990 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1991 cmp = blk1->yMin - blk2->yMin;
1992 }
1993 break;
1994 case 2:
1995 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1996 cmp = blk2->xMax - blk1->xMax;
1997 }
1998 break;
1999 case 3:
2000 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
2001 cmp = blk2->yMax - blk1->yMax;
2002 }
2003 break;
2004 }
2005 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2006 }
2007
primaryCmp(const TextBlock * blk) const2008 int TextBlock::primaryCmp(const TextBlock *blk) const
2009 {
2010 double cmp;
2011
2012 cmp = 0; // make gcc happy
2013 switch (rot) {
2014 case 0:
2015 cmp = xMin - blk->xMin;
2016 break;
2017 case 1:
2018 cmp = yMin - blk->yMin;
2019 break;
2020 case 2:
2021 cmp = blk->xMax - xMax;
2022 break;
2023 case 3:
2024 cmp = blk->yMax - yMax;
2025 break;
2026 }
2027 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
2028 }
2029
secondaryDelta(const TextBlock * blk) const2030 double TextBlock::secondaryDelta(const TextBlock *blk) const
2031 {
2032 double delta;
2033
2034 delta = 0; // make gcc happy
2035 switch (rot) {
2036 case 0:
2037 delta = blk->yMin - yMax;
2038 break;
2039 case 1:
2040 delta = xMin - blk->xMax;
2041 break;
2042 case 2:
2043 delta = yMin - blk->yMax;
2044 break;
2045 case 3:
2046 delta = blk->xMin - xMax;
2047 break;
2048 }
2049 return delta;
2050 }
2051
isBelow(const TextBlock * blk) const2052 bool TextBlock::isBelow(const TextBlock *blk) const
2053 {
2054 bool below;
2055
2056 below = false; // make gcc happy
2057 switch (page->primaryRot) {
2058 case 0:
2059 below = xMin >= blk->priMin && xMax <= blk->priMax && yMin > blk->yMin;
2060 break;
2061 case 1:
2062 below = yMin >= blk->priMin && yMax <= blk->priMax && xMax < blk->xMax;
2063 break;
2064 case 2:
2065 below = xMin >= blk->priMin && xMax <= blk->priMax && yMax < blk->yMax;
2066 break;
2067 case 3:
2068 below = yMin >= blk->priMin && yMax <= blk->priMax && xMin > blk->xMin;
2069 break;
2070 }
2071
2072 return below;
2073 }
2074
isBeforeByRule1(const TextBlock * blk1)2075 bool TextBlock::isBeforeByRule1(const TextBlock *blk1)
2076 {
2077 bool before = false;
2078 bool overlap = false;
2079
2080 switch (this->page->primaryRot) {
2081 case 0:
2082 case 2:
2083 overlap = ((this->ExMin <= blk1->ExMin) && (blk1->ExMin <= this->ExMax)) || ((blk1->ExMin <= this->ExMin) && (this->ExMin <= blk1->ExMax));
2084 break;
2085 case 1:
2086 case 3:
2087 overlap = ((this->EyMin <= blk1->EyMin) && (blk1->EyMin <= this->EyMax)) || ((blk1->EyMin <= this->EyMin) && (this->EyMin <= blk1->EyMax));
2088 break;
2089 }
2090 switch (this->page->primaryRot) {
2091 case 0:
2092 before = overlap && this->EyMin < blk1->EyMin;
2093 break;
2094 case 1:
2095 before = overlap && this->ExMax > blk1->ExMax;
2096 break;
2097 case 2:
2098 before = overlap && this->EyMax > blk1->EyMax;
2099 break;
2100 case 3:
2101 before = overlap && this->ExMin < blk1->ExMin;
2102 break;
2103 }
2104 return before;
2105 }
2106
isBeforeByRule2(const TextBlock * blk1)2107 bool TextBlock::isBeforeByRule2(const TextBlock *blk1)
2108 {
2109 double cmp = 0;
2110 int rotLR = rot;
2111
2112 if (!page->primaryLR) {
2113 rotLR = (rotLR + 2) % 4;
2114 }
2115
2116 switch (rotLR) {
2117 case 0:
2118 cmp = ExMax - blk1->ExMin;
2119 break;
2120 case 1:
2121 cmp = EyMin - blk1->EyMax;
2122 break;
2123 case 2:
2124 cmp = blk1->ExMax - ExMin;
2125 break;
2126 case 3:
2127 cmp = blk1->EyMin - EyMax;
2128 break;
2129 }
2130 return cmp <= 0;
2131 }
2132
2133 // Sort into reading order by performing a topological sort using the rules
2134 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
2135 // See http://pubs.iupr.org/#2003-breuel-sdiut
2136 // Topological sort is done by depth first search, see
2137 // http://en.wikipedia.org/wiki/Topological_sorting
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,bool * visited,TextBlock ** cache,int cacheSize)2138 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize)
2139 {
2140 int pos2;
2141 TextBlock *blk1, *blk2, *blk3;
2142 bool before;
2143
2144 if (visited[pos1]) {
2145 return sortPos;
2146 }
2147
2148 blk1 = this;
2149
2150 #if 0 // for debugging
2151 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
2152 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2153 #endif
2154 visited[pos1] = true;
2155 pos2 = -1;
2156 for (blk2 = blkList; blk2; blk2 = blk2->next) {
2157 pos2++;
2158 if (visited[pos2]) {
2159 // skip visited nodes
2160 continue;
2161 }
2162 before = false;
2163
2164 // is blk2 before blk1? (for table entries)
2165 if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
2166 if (page->primaryLR) {
2167 if (blk2->xMax <= blk1->xMin && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2168 before = true;
2169 } else {
2170 if (blk2->xMin >= blk1->xMax && blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin)
2171 before = true;
2172 }
2173
2174 if (blk2->yMax <= blk1->yMin)
2175 before = true;
2176 } else {
2177 if (blk2->isBeforeByRule1(blk1)) {
2178 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
2179 before = true;
2180 #if 0 // for debugging
2181 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2182 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
2183 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2184 #endif
2185 } else if (blk2->isBeforeByRule2(blk1)) {
2186 // Rule (2) blk2 left of blk1, and no intervening blk3
2187 // such that blk1 is before blk3 by rule 1,
2188 // and blk3 is before blk2 by rule 1.
2189 before = true;
2190 for (int i = 0; i < cacheSize && cache[i]; ++i) {
2191 if (blk1->isBeforeByRule1(cache[i]) && cache[i]->isBeforeByRule1(blk2)) {
2192 before = false;
2193 std::rotate(cache, cache + i, cache + i + 1);
2194 break;
2195 }
2196 }
2197
2198 if (before) {
2199 for (blk3 = blkList; blk3; blk3 = blk3->next) {
2200 if (blk3 == blk2 || blk3 == blk1) {
2201 continue;
2202 }
2203 if (blk1->isBeforeByRule1(blk3) && blk3->isBeforeByRule1(blk2)) {
2204 before = false;
2205 std::copy_backward(cache, cache + cacheSize - 1, cache + cacheSize);
2206 cache[0] = blk3;
2207 break;
2208 }
2209 }
2210 }
2211 #if 0 // for debugging
2212 if (before) {
2213 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
2214 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
2215 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
2216 }
2217 #endif
2218 }
2219 }
2220 if (before) {
2221 // blk2 is before blk1, so it needs to be visited
2222 // before we can add blk1 to the sorted list.
2223 sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited, cache, cacheSize);
2224 }
2225 }
2226 #if 0 // for debugging
2227 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
2228 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
2229 #endif
2230 sorted[sortPos++] = blk1;
2231 return sortPos;
2232 }
2233
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,bool * visited)2234 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited)
2235 {
2236 const int blockCacheSize = 4;
2237 TextBlock *blockCache[blockCacheSize];
2238 std::fill(blockCache, blockCache + blockCacheSize, nullptr);
2239 return visitDepthFirst(blkList, pos1, sorted, sortPos, visited, blockCache, blockCacheSize);
2240 }
2241
2242 //------------------------------------------------------------------------
2243 // TextFlow
2244 //------------------------------------------------------------------------
2245
TextFlow(TextPage * pageA,TextBlock * blk)2246 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk)
2247 {
2248 page = pageA;
2249 xMin = blk->xMin;
2250 xMax = blk->xMax;
2251 yMin = blk->yMin;
2252 yMax = blk->yMax;
2253 priMin = blk->priMin;
2254 priMax = blk->priMax;
2255 blocks = lastBlk = blk;
2256 next = nullptr;
2257 }
2258
~TextFlow()2259 TextFlow::~TextFlow()
2260 {
2261 TextBlock *blk;
2262
2263 while (blocks) {
2264 blk = blocks;
2265 blocks = blocks->next;
2266 delete blk;
2267 }
2268 }
2269
addBlock(TextBlock * blk)2270 void TextFlow::addBlock(TextBlock *blk)
2271 {
2272 if (lastBlk) {
2273 lastBlk->next = blk;
2274 } else {
2275 blocks = blk;
2276 }
2277 lastBlk = blk;
2278 if (blk->xMin < xMin) {
2279 xMin = blk->xMin;
2280 }
2281 if (blk->xMax > xMax) {
2282 xMax = blk->xMax;
2283 }
2284 if (blk->yMin < yMin) {
2285 yMin = blk->yMin;
2286 }
2287 if (blk->yMax > yMax) {
2288 yMax = blk->yMax;
2289 }
2290 }
2291
blockFits(const TextBlock * blk,const TextBlock * prevBlk) const2292 bool TextFlow::blockFits(const TextBlock *blk, const TextBlock *prevBlk) const
2293 {
2294 bool fits;
2295
2296 // lower blocks must use smaller fonts
2297 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
2298 return false;
2299 }
2300
2301 fits = false; // make gcc happy
2302 switch (page->primaryRot) {
2303 case 0:
2304 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2305 break;
2306 case 1:
2307 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2308 break;
2309 case 2:
2310 fits = blk->xMin >= priMin && blk->xMax <= priMax;
2311 break;
2312 case 3:
2313 fits = blk->yMin >= priMin && blk->yMax <= priMax;
2314 break;
2315 }
2316 return fits;
2317 }
2318
2319 #ifdef TEXTOUT_WORD_LIST
2320
2321 //------------------------------------------------------------------------
2322 // TextWordList
2323 //------------------------------------------------------------------------
2324
TextWordList(const TextPage * text,bool physLayout)2325 TextWordList::TextWordList(const TextPage *text, bool physLayout)
2326 {
2327 TextFlow *flow;
2328 TextBlock *blk;
2329 TextLine *line;
2330 TextWord *word;
2331 TextWord **wordArray;
2332 int nWords, i;
2333
2334 if (text->rawOrder) {
2335 for (word = text->rawWords; word; word = word->next) {
2336 words.push_back(word);
2337 }
2338
2339 } else if (physLayout) {
2340 // this is inefficient, but it's also the least useful of these
2341 // three cases
2342 nWords = 0;
2343 for (flow = text->flows; flow; flow = flow->next) {
2344 for (blk = flow->blocks; blk; blk = blk->next) {
2345 for (line = blk->lines; line; line = line->next) {
2346 for (word = line->words; word; word = word->next) {
2347 ++nWords;
2348 }
2349 }
2350 }
2351 }
2352 wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
2353 i = 0;
2354 for (flow = text->flows; flow; flow = flow->next) {
2355 for (blk = flow->blocks; blk; blk = blk->next) {
2356 for (line = blk->lines; line; line = line->next) {
2357 for (word = line->words; word; word = word->next) {
2358 wordArray[i++] = word;
2359 }
2360 }
2361 }
2362 }
2363 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
2364 for (i = 0; i < nWords; ++i) {
2365 words.push_back(wordArray[i]);
2366 }
2367 gfree(wordArray);
2368
2369 } else {
2370 for (flow = text->flows; flow; flow = flow->next) {
2371 for (blk = flow->blocks; blk; blk = blk->next) {
2372 for (line = blk->lines; line; line = line->next) {
2373 for (word = line->words; word; word = word->next) {
2374 words.push_back(word);
2375 }
2376 }
2377 }
2378 }
2379 }
2380 }
2381
~TextWordList()2382 TextWordList::~TextWordList() { }
2383
getLength() const2384 int TextWordList::getLength() const
2385 {
2386 return words.size();
2387 }
2388
get(int idx)2389 TextWord *TextWordList::get(int idx)
2390 {
2391 if (idx < 0 || idx >= (int)words.size()) {
2392 return nullptr;
2393 }
2394 return words[idx];
2395 }
2396
2397 #endif // TEXTOUT_WORD_LIST
2398
2399 //------------------------------------------------------------------------
2400 // TextPage
2401 //------------------------------------------------------------------------
2402
TextPage(bool rawOrderA,bool discardDiagA)2403 TextPage::TextPage(bool rawOrderA, bool discardDiagA)
2404 {
2405 int rot;
2406
2407 refCnt = 1;
2408 rawOrder = rawOrderA;
2409 discardDiag = discardDiagA;
2410 curWord = nullptr;
2411 charPos = 0;
2412 curFont = nullptr;
2413 curFontSize = 0;
2414 nest = 0;
2415 nTinyChars = 0;
2416 lastCharOverlap = false;
2417 if (!rawOrder) {
2418 for (rot = 0; rot < 4; ++rot) {
2419 pools[rot] = std::make_unique<TextPool>();
2420 }
2421 }
2422 flows = nullptr;
2423 blocks = nullptr;
2424 rawWords = nullptr;
2425 rawLastWord = nullptr;
2426 lastFindXMin = lastFindYMin = 0;
2427 haveLastFind = false;
2428 mergeCombining = true;
2429 diagonal = false;
2430 }
2431
~TextPage()2432 TextPage::~TextPage()
2433 {
2434 clear();
2435 }
2436
incRefCnt()2437 void TextPage::incRefCnt()
2438 {
2439 refCnt++;
2440 }
2441
decRefCnt()2442 void TextPage::decRefCnt()
2443 {
2444 if (--refCnt == 0)
2445 delete this;
2446 }
2447
startPage(const GfxState * state)2448 void TextPage::startPage(const GfxState *state)
2449 {
2450 clear();
2451 if (state) {
2452 pageWidth = state->getPageWidth();
2453 pageHeight = state->getPageHeight();
2454 } else {
2455 pageWidth = pageHeight = 0;
2456 }
2457 }
2458
endPage()2459 void TextPage::endPage()
2460 {
2461 if (curWord) {
2462 endWord();
2463 }
2464 }
2465
clear()2466 void TextPage::clear()
2467 {
2468 int rot;
2469 TextFlow *flow;
2470 TextWord *word;
2471
2472 if (curWord) {
2473 delete curWord;
2474 curWord = nullptr;
2475 }
2476 if (rawOrder) {
2477 while (rawWords) {
2478 word = rawWords;
2479 rawWords = rawWords->next;
2480 delete word;
2481 }
2482 } else {
2483 for (rot = 0; rot < 4; ++rot) {
2484 pools[rot] = std::make_unique<TextPool>();
2485 }
2486 while (flows) {
2487 flow = flows;
2488 flows = flows->next;
2489 delete flow;
2490 }
2491 gfree(blocks);
2492 }
2493 fonts.clear();
2494 underlines.clear();
2495 links.clear();
2496
2497 diagonal = false;
2498 curWord = nullptr;
2499 charPos = 0;
2500 curFont = nullptr;
2501 curFontSize = 0;
2502 nest = 0;
2503 nTinyChars = 0;
2504 flows = nullptr;
2505 blocks = nullptr;
2506 rawWords = nullptr;
2507 rawLastWord = nullptr;
2508 }
2509
updateFont(const GfxState * state)2510 void TextPage::updateFont(const GfxState *state)
2511 {
2512 GfxFont *gfxFont;
2513 const double *fm;
2514 const char *name;
2515 int code, mCode, letterCode, anyCode;
2516 double w;
2517
2518 // get the font info object
2519 curFont = nullptr;
2520 for (const std::unique_ptr<TextFontInfo> &f : fonts) {
2521 if (f->matches(state)) {
2522 curFont = f.get();
2523 break;
2524 }
2525 }
2526 if (!curFont) {
2527 fonts.emplace_back(std::make_unique<TextFontInfo>(state));
2528 curFont = fonts.back().get();
2529 }
2530
2531 // adjust the font size
2532 gfxFont = state->getFont();
2533 curFontSize = state->getTransformedFontSize();
2534 if (gfxFont && gfxFont->getType() == fontType3) {
2535 // This is a hack which makes it possible to deal with some Type 3
2536 // fonts. The problem is that it's impossible to know what the
2537 // base coordinate system used in the font is without actually
2538 // rendering the font. This code tries to guess by looking at the
2539 // width of the character 'm' (which breaks if the font is a
2540 // subset that doesn't contain 'm').
2541 mCode = letterCode = anyCode = -1;
2542 for (code = 0; code < 256; ++code) {
2543 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2544 int nameLen = name ? strlen(name) : 0;
2545 bool nameOneChar = nameLen == 1 || (nameLen > 1 && name[1] == '\0');
2546 if (nameOneChar && name[0] == 'm') {
2547 mCode = code;
2548 }
2549 if (letterCode < 0 && nameOneChar && ((name[0] >= 'A' && name[0] <= 'Z') || (name[0] >= 'a' && name[0] <= 'z'))) {
2550 letterCode = code;
2551 }
2552 if (anyCode < 0 && name && ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2553 anyCode = code;
2554 }
2555 }
2556 if (mCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2557 // 0.6 is a generic average 'm' width -- yes, this is a hack
2558 curFontSize *= w / 0.6;
2559 } else if (letterCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2560 // even more of a hack: 0.5 is a generic letter width
2561 curFontSize *= w / 0.5;
2562 } else if (anyCode >= 0 && (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2563 // better than nothing: 0.5 is a generic character width
2564 curFontSize *= w / 0.5;
2565 }
2566 fm = gfxFont->getFontMatrix();
2567 if (fm[0] != 0) {
2568 curFontSize *= fabs(fm[3] / fm[0]);
2569 }
2570 }
2571 }
2572
beginWord(const GfxState * state)2573 void TextPage::beginWord(const GfxState *state)
2574 {
2575 GfxFont *gfxFont;
2576 const double *fontm;
2577 double m[4], m2[4];
2578 int rot;
2579
2580 // This check is needed because Type 3 characters can contain
2581 // text-drawing operations (when TextPage is being used via
2582 // {X,Win}SplashOutputDev rather than TextOutputDev).
2583 if (curWord) {
2584 ++nest;
2585 return;
2586 }
2587
2588 // compute the rotation
2589 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2590 gfxFont = state->getFont();
2591 if (gfxFont && gfxFont->getType() == fontType3) {
2592 fontm = state->getFont()->getFontMatrix();
2593 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2594 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2595 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2596 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2597 m[0] = m2[0];
2598 m[1] = m2[1];
2599 m[2] = m2[2];
2600 m[3] = m2[3];
2601 }
2602 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2603 rot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
2604 } else {
2605 rot = (m[2] > 0) ? 1 : 3;
2606 }
2607 if (fabs(m[0]) >= fabs(m[1])) {
2608 diagonal = fabs(m[1]) > diagonalThreshold * fabs(m[0]);
2609 } else {
2610 diagonal = fabs(m[0]) > diagonalThreshold * fabs(m[1]);
2611 }
2612
2613 // for vertical writing mode, the lines are effectively rotated 90
2614 // degrees
2615 if (gfxFont && gfxFont->getWMode()) {
2616 rot = (rot + 1) & 3;
2617 }
2618
2619 curWord = new TextWord(state, rot, curFontSize);
2620 }
2621
addChar(const GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,const Unicode * u,int uLen)2622 void TextPage::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
2623 {
2624 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2625 bool overlap;
2626 int i;
2627 int wMode;
2628 Matrix mat;
2629
2630 // subtract char and word spacing from the dx,dy values
2631 sp = state->getCharSpace();
2632 if (c == (CharCode)0x20) {
2633 sp += state->getWordSpace();
2634 }
2635 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2636 dx -= dx2;
2637 dy -= dy2;
2638 state->transformDelta(dx, dy, &w1, &h1);
2639
2640 // throw away chars that aren't inside the page bounds
2641 // (and also do a sanity check on the character size)
2642 state->transform(x, y, &x1, &y1);
2643 if (x1 + w1 < 0 || x1 > pageWidth || y1 + h1 < 0 || y1 > pageHeight || x1 != x1 || y1 != y1 || // IEEE way of checking for isnan
2644 w1 != w1 || h1 != h1) {
2645 charPos += nBytes;
2646 return;
2647 }
2648
2649 // check the tiny chars limit
2650 if (fabs(w1) < 3 && fabs(h1) < 3) {
2651 if (++nTinyChars > 50000) {
2652 charPos += nBytes;
2653 return;
2654 }
2655 }
2656
2657 // break words at space character
2658 if (uLen == 1 && UnicodeIsWhitespace(u[0])) {
2659 charPos += nBytes;
2660 endWord();
2661 return;
2662 } else if (uLen == 1 && u[0] == (Unicode)0x0) {
2663 // ignore null characters
2664 charPos += nBytes;
2665 return;
2666 }
2667
2668 state->getFontTransMat(&mat.m[0], &mat.m[1], &mat.m[2], &mat.m[3]);
2669 mat.m[0] *= state->getHorizScaling();
2670 mat.m[1] *= state->getHorizScaling();
2671 mat.m[4] = x1;
2672 mat.m[5] = y1;
2673
2674 if (mergeCombining && curWord && uLen == 1 && curWord->addCombining(state, curFont, curFontSize, x1, y1, w1, h1, charPos, nBytes, c, u[0], mat)) {
2675 charPos += nBytes;
2676 return;
2677 }
2678
2679 // start a new word if:
2680 // (1) this character doesn't fall in the right place relative to
2681 // the end of the previous word (this places upper and lower
2682 // constraints on the position deltas along both the primary
2683 // and secondary axes), or
2684 // (2) this character overlaps the previous one (duplicated text), or
2685 // (3) the previous character was an overlap (we want each duplicated
2686 // character to be in a word by itself at this stage),
2687 // (4) the font size has changed
2688 // (5) the WMode changed
2689 if (curWord && curWord->len > 0) {
2690 base = sp = delta = 0; // make gcc happy
2691 switch (curWord->rot) {
2692 case 0:
2693 base = y1;
2694 sp = x1 - curWord->xMax;
2695 delta = x1 - curWord->edge[curWord->len - 1];
2696 break;
2697 case 1:
2698 base = x1;
2699 sp = y1 - curWord->yMax;
2700 delta = y1 - curWord->edge[curWord->len - 1];
2701 break;
2702 case 2:
2703 base = y1;
2704 sp = curWord->xMin - x1;
2705 delta = curWord->edge[curWord->len - 1] - x1;
2706 break;
2707 case 3:
2708 base = x1;
2709 sp = curWord->yMin - y1;
2710 delta = curWord->edge[curWord->len - 1] - y1;
2711 break;
2712 }
2713 overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize && fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2714 wMode = curFont->getWMode();
2715 if (overlap || lastCharOverlap || sp < -minDupBreakOverlap * curWord->fontSize || sp > minWordBreakSpace * curWord->fontSize || fabs(base - curWord->base) > 0.5 || curFontSize != curWord->fontSize || wMode != curWord->wMode) {
2716 endWord();
2717 }
2718 lastCharOverlap = overlap;
2719 } else {
2720 lastCharOverlap = false;
2721 }
2722
2723 if (uLen != 0) {
2724 // start a new word if needed
2725 if (!curWord) {
2726 beginWord(state);
2727 }
2728
2729 // throw away diagonal chars
2730 if (discardDiag && diagonal) {
2731 charPos += nBytes;
2732 return;
2733 }
2734
2735 // page rotation and/or transform matrices can cause text to be
2736 // drawn in reverse order -- in this case, swap the begin/end
2737 // coordinates and break text into individual chars
2738 if ((curWord->rot == 0 && w1 < 0) || (curWord->rot == 1 && h1 < 0) || (curWord->rot == 2 && w1 > 0) || (curWord->rot == 3 && h1 > 0)) {
2739 endWord();
2740 beginWord(state);
2741
2742 // throw away diagonal chars
2743 if (discardDiag && diagonal) {
2744 charPos += nBytes;
2745 return;
2746 }
2747
2748 x1 += w1;
2749 y1 += h1;
2750 w1 = -w1;
2751 h1 = -h1;
2752 }
2753
2754 // add the characters to the current word
2755 w1 /= uLen;
2756 h1 /= uLen;
2757 for (i = 0; i < uLen; ++i) {
2758 curWord->addChar(state, curFont, x1 + i * w1, y1 + i * h1, w1, h1, charPos, nBytes, c, u[i], mat);
2759 }
2760 }
2761 charPos += nBytes;
2762 }
2763
incCharCount(int nChars)2764 void TextPage::incCharCount(int nChars)
2765 {
2766 charPos += nChars;
2767 }
2768
endWord()2769 void TextPage::endWord()
2770 {
2771 // This check is needed because Type 3 characters can contain
2772 // text-drawing operations (when TextPage is being used via
2773 // {X,Win}SplashOutputDev rather than TextOutputDev).
2774 if (nest > 0) {
2775 --nest;
2776 return;
2777 }
2778
2779 if (curWord) {
2780 addWord(curWord);
2781 curWord = nullptr;
2782 }
2783 }
2784
addWord(TextWord * word)2785 void TextPage::addWord(TextWord *word)
2786 {
2787 // throw away zero-length words -- they don't have valid xMin/xMax
2788 // values, and they're useless anyway
2789 if (word->len == 0) {
2790 delete word;
2791 return;
2792 }
2793
2794 if (rawOrder) {
2795 if (rawLastWord) {
2796 rawLastWord->next = word;
2797 } else {
2798 rawWords = word;
2799 }
2800 rawLastWord = word;
2801 } else {
2802 pools[word->rot]->addWord(word);
2803 }
2804 }
2805
addUnderline(double x0,double y0,double x1,double y1)2806 void TextPage::addUnderline(double x0, double y0, double x1, double y1)
2807 {
2808 underlines.emplace_back(std::make_unique<TextUnderline>(x0, y0, x1, y1));
2809 }
2810
addLink(int xMin,int yMin,int xMax,int yMax,AnnotLink * link)2811 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link)
2812 {
2813 links.emplace_back(std::make_unique<TextLink>(xMin, yMin, xMax, yMax, link));
2814 }
2815
coalesce(bool physLayout,double fixedPitch,bool doHTML)2816 void TextPage::coalesce(bool physLayout, double fixedPitch, bool doHTML)
2817 {
2818 TextWord *word0, *word1, *word2;
2819 TextLine *line;
2820 TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2821 TextFlow *flow, *lastFlow;
2822 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2823 double minBase, maxBase, newMinBase, newMaxBase;
2824 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2825 bool found;
2826 int count[4];
2827 int lrCount;
2828 int col1, col2;
2829 int j, n;
2830
2831 if (rawOrder) {
2832 primaryRot = 0;
2833 primaryLR = true;
2834 return;
2835 }
2836
2837 const UnicodeMap *uMap = globalParams->getTextEncoding();
2838 blkList = nullptr;
2839 lastBlk = nullptr;
2840 nBlocks = 0;
2841 primaryRot = 0;
2842
2843 #if 0 // for debugging
2844 printf("*** initial words ***\n");
2845 for (rot = 0; rot < 4; ++rot) {
2846 pool = pools[rot];
2847 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2848 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2849 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2850 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2851 word0->base, word0->fontSize, rot*90, word0->link);
2852 for (i = 0; i < word0->len; ++i) {
2853 fputc(word0->text[i] & 0xff, stdout);
2854 }
2855 printf("'\n");
2856 }
2857 }
2858 }
2859 printf("\n");
2860 #endif
2861
2862 #if 0 //~ for debugging
2863 for (i = 0; i < underlines->getLength(); ++i) {
2864 underline = (TextUnderline *)underlines->get(i);
2865 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2866 underline->x0, underline->x1, underline->y0, underline->y1,
2867 underline->horiz);
2868 }
2869 #endif
2870
2871 if (doHTML) {
2872
2873 //----- handle underlining
2874 for (const std::unique_ptr<TextUnderline> &underline : underlines) {
2875 if (underline->horiz) {
2876 // rot = 0
2877 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2878 startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2879 endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2880 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2881 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2882 //~ need to check the y value against the word baseline
2883 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2884 word0->underlined = true;
2885 }
2886 }
2887 }
2888 }
2889
2890 // rot = 2
2891 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2892 startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2893 endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2894 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2895 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2896 if (underline->x0 < word0->xMin + underlineSlack && word0->xMax - underlineSlack < underline->x1) {
2897 word0->underlined = true;
2898 }
2899 }
2900 }
2901 }
2902 } else {
2903 // rot = 1
2904 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2905 startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2906 endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2907 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2908 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2909 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2910 word0->underlined = true;
2911 }
2912 }
2913 }
2914 }
2915
2916 // rot = 3
2917 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2918 startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2919 endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2920 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2921 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2922 if (underline->y0 < word0->yMin + underlineSlack && word0->yMax - underlineSlack < underline->y1) {
2923 word0->underlined = true;
2924 }
2925 }
2926 }
2927 }
2928 }
2929 }
2930
2931 //----- handle links
2932 for (const std::unique_ptr<TextLink> &link : links) {
2933 // rot = 0
2934 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2935 startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2936 endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2937 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2938 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2939 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2940 word0->link = link->link;
2941 }
2942 }
2943 }
2944 }
2945
2946 // rot = 2
2947 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2948 startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2949 endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2950 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2951 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2952 if (link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax && link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax) {
2953 word0->link = link->link;
2954 }
2955 }
2956 }
2957 }
2958
2959 // rot = 1
2960 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2961 startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2962 endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2963 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2964 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2965 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2966 word0->link = link->link;
2967 }
2968 }
2969 }
2970 }
2971
2972 // rot = 3
2973 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2974 startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2975 endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2976 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2977 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2978 if (link->yMin < word0->yMin + hyperlinkSlack && word0->yMax - hyperlinkSlack < link->yMax && link->xMin < word0->xMin + hyperlinkSlack && word0->xMax - hyperlinkSlack < link->xMax) {
2979 word0->link = link->link;
2980 }
2981 }
2982 }
2983 }
2984 }
2985 }
2986
2987 //----- assemble the blocks
2988
2989 //~ add an outer loop for writing mode (vertical text)
2990
2991 // build blocks for each rotation value
2992 for (rot = 0; rot < 4; ++rot) {
2993 std::unique_ptr<TextPool> &pool = pools[rot];
2994 poolMinBaseIdx = pool->minBaseIdx;
2995 count[rot] = 0;
2996
2997 // add blocks until no more words are left
2998 while (true) {
2999
3000 // find the first non-empty line in the pool
3001 for (; poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx); ++poolMinBaseIdx)
3002 ;
3003 if (poolMinBaseIdx > pool->maxBaseIdx) {
3004 break;
3005 }
3006
3007 // look for the left-most word in the first four lines of the
3008 // pool -- this avoids starting with a superscript word
3009 startBaseIdx = poolMinBaseIdx;
3010 for (baseIdx = poolMinBaseIdx + 1; baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx; ++baseIdx) {
3011 if (!pool->getPool(baseIdx)) {
3012 continue;
3013 }
3014 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx)) < 0) {
3015 startBaseIdx = baseIdx;
3016 }
3017 }
3018
3019 // create a new block
3020 word0 = pool->getPool(startBaseIdx);
3021 pool->setPool(startBaseIdx, word0->next);
3022 word0->next = nullptr;
3023 blk = new TextBlock(this, rot);
3024 blk->addWord(word0);
3025
3026 fontSize = word0->fontSize;
3027 minBase = maxBase = word0->base;
3028 colSpace1 = minColSpacing1 * fontSize;
3029 colSpace2 = minColSpacing2 * fontSize;
3030 lineSpace = maxLineSpacingDelta * fontSize;
3031 intraLineSpace = maxIntraLineDelta * fontSize;
3032
3033 // add words to the block
3034 do {
3035 found = false;
3036
3037 // look for words on the line above the current top edge of
3038 // the block
3039 newMinBase = minBase;
3040 for (baseIdx = pool->getBaseIdx(minBase); baseIdx >= pool->getBaseIdx(minBase - lineSpace); --baseIdx) {
3041 word0 = nullptr;
3042 word1 = pool->getPool(baseIdx);
3043 while (word1) {
3044 if (word1->base < minBase && word1->base >= minBase - lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3045 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3046 word2 = word1;
3047 if (word0) {
3048 word0->next = word1->next;
3049 } else {
3050 pool->setPool(baseIdx, word1->next);
3051 }
3052 word1 = word1->next;
3053 word2->next = nullptr;
3054 blk->addWord(word2);
3055 found = true;
3056 newMinBase = word2->base;
3057 } else {
3058 word0 = word1;
3059 word1 = word1->next;
3060 }
3061 }
3062 }
3063 minBase = newMinBase;
3064
3065 // look for words on the line below the current bottom edge of
3066 // the block
3067 newMaxBase = maxBase;
3068 for (baseIdx = pool->getBaseIdx(maxBase); baseIdx <= pool->getBaseIdx(maxBase + lineSpace); ++baseIdx) {
3069 word0 = nullptr;
3070 word1 = pool->getPool(baseIdx);
3071 while (word1) {
3072 if (word1->base > maxBase && word1->base <= maxBase + lineSpace && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin) : (word1->yMin < blk->yMax && word1->yMax > blk->yMin))
3073 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta1 * fontSize) {
3074 word2 = word1;
3075 if (word0) {
3076 word0->next = word1->next;
3077 } else {
3078 pool->setPool(baseIdx, word1->next);
3079 }
3080 word1 = word1->next;
3081 word2->next = nullptr;
3082 blk->addWord(word2);
3083 found = true;
3084 newMaxBase = word2->base;
3085 } else {
3086 word0 = word1;
3087 word1 = word1->next;
3088 }
3089 }
3090 }
3091 maxBase = newMaxBase;
3092
3093 // look for words that are on lines already in the block, and
3094 // that overlap the block horizontally
3095 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3096 word0 = nullptr;
3097 word1 = pool->getPool(baseIdx);
3098 while (word1) {
3099 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3100 && ((rot == 0 || rot == 2) ? (word1->xMin < blk->xMax + colSpace1 && word1->xMax > blk->xMin - colSpace1) : (word1->yMin < blk->yMax + colSpace1 && word1->yMax > blk->yMin - colSpace1))
3101 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta2 * fontSize) {
3102 word2 = word1;
3103 if (word0) {
3104 word0->next = word1->next;
3105 } else {
3106 pool->setPool(baseIdx, word1->next);
3107 }
3108 word1 = word1->next;
3109 word2->next = nullptr;
3110 blk->addWord(word2);
3111 found = true;
3112 } else {
3113 word0 = word1;
3114 word1 = word1->next;
3115 }
3116 }
3117 }
3118
3119 // only check for outlying words (the next two chunks of code)
3120 // if we didn't find anything else
3121 if (found) {
3122 continue;
3123 }
3124
3125 // scan down the left side of the block, looking for words
3126 // that are near (but not overlapping) the block; if there are
3127 // three or fewer, add them to the block
3128 n = 0;
3129 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3130 word1 = pool->getPool(baseIdx);
3131 while (word1) {
3132 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3133 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3134 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3135 ++n;
3136 break;
3137 }
3138 word1 = word1->next;
3139 }
3140 }
3141 if (n > 0 && n <= 3) {
3142 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3143 word0 = nullptr;
3144 word1 = pool->getPool(baseIdx);
3145 while (word1) {
3146 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3147 && ((rot == 0 || rot == 2) ? (word1->xMax <= blk->xMin && word1->xMax > blk->xMin - colSpace2) : (word1->yMax <= blk->yMin && word1->yMax > blk->yMin - colSpace2))
3148 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3149 word2 = word1;
3150 if (word0) {
3151 word0->next = word1->next;
3152 } else {
3153 pool->setPool(baseIdx, word1->next);
3154 }
3155 word1 = word1->next;
3156 word2->next = nullptr;
3157 blk->addWord(word2);
3158 if (word2->base < minBase) {
3159 minBase = word2->base;
3160 } else if (word2->base > maxBase) {
3161 maxBase = word2->base;
3162 }
3163 found = true;
3164 break;
3165 } else {
3166 word0 = word1;
3167 word1 = word1->next;
3168 }
3169 }
3170 }
3171 }
3172
3173 // scan down the right side of the block, looking for words
3174 // that are near (but not overlapping) the block; if there are
3175 // three or fewer, add them to the block
3176 n = 0;
3177 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3178 word1 = pool->getPool(baseIdx);
3179 while (word1) {
3180 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3181 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3182 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3183 ++n;
3184 break;
3185 }
3186 word1 = word1->next;
3187 }
3188 }
3189 if (n > 0 && n <= 3) {
3190 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace); baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace); ++baseIdx) {
3191 word0 = nullptr;
3192 word1 = pool->getPool(baseIdx);
3193 while (word1) {
3194 if (word1->base >= minBase - intraLineSpace && word1->base <= maxBase + intraLineSpace
3195 && ((rot == 0 || rot == 2) ? (word1->xMin >= blk->xMax && word1->xMin < blk->xMax + colSpace2) : (word1->yMin >= blk->yMax && word1->yMin < blk->yMax + colSpace2))
3196 && fabs(word1->fontSize - fontSize) < maxBlockFontSizeDelta3 * fontSize) {
3197 word2 = word1;
3198 if (word0) {
3199 word0->next = word1->next;
3200 } else {
3201 pool->setPool(baseIdx, word1->next);
3202 }
3203 word1 = word1->next;
3204 word2->next = nullptr;
3205 blk->addWord(word2);
3206 if (word2->base < minBase) {
3207 minBase = word2->base;
3208 } else if (word2->base > maxBase) {
3209 maxBase = word2->base;
3210 }
3211 found = true;
3212 break;
3213 } else {
3214 word0 = word1;
3215 word1 = word1->next;
3216 }
3217 }
3218 }
3219 }
3220
3221 } while (found);
3222
3223 //~ need to compute the primary writing mode (horiz/vert) in
3224 //~ addition to primary rotation
3225
3226 // coalesce the block, and add it to the list
3227 blk->coalesce(uMap, fixedPitch);
3228 if (lastBlk) {
3229 lastBlk->next = blk;
3230 } else {
3231 blkList = blk;
3232 }
3233 lastBlk = blk;
3234 count[rot] += blk->charCount;
3235 ++nBlocks;
3236 }
3237
3238 if (count[rot] > count[primaryRot]) {
3239 primaryRot = rot;
3240 }
3241 }
3242
3243 #if 0 // for debugging
3244 printf("*** rotation ***\n");
3245 for (rot = 0; rot < 4; ++rot) {
3246 printf(" %d: %6d\n", rot, count[rot]);
3247 }
3248 printf(" primary rot = %d\n", primaryRot);
3249 printf("\n");
3250 #endif
3251
3252 #if 0 // for debugging
3253 printf("*** blocks ***\n");
3254 for (blk = blkList; blk; blk = blk->next) {
3255 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
3256 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
3257 for (line = blk->lines; line; line = line->next) {
3258 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
3259 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
3260 for (word0 = line->words; word0; word0 = word0->next) {
3261 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3262 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3263 word0->base, word0->fontSize, word0->spaceAfter);
3264 for (i = 0; i < word0->len; ++i) {
3265 fputc(word0->text[i] & 0xff, stdout);
3266 }
3267 printf("'\n");
3268 }
3269 }
3270 }
3271 printf("\n");
3272 #endif
3273
3274 // determine the primary direction
3275 lrCount = 0;
3276 for (blk = blkList; blk; blk = blk->next) {
3277 for (line = blk->lines; line; line = line->next) {
3278 for (word0 = line->words; word0; word0 = word0->next) {
3279 for (int i = 0; i < word0->len; ++i) {
3280 if (unicodeTypeL(word0->text[i])) {
3281 ++lrCount;
3282 } else if (unicodeTypeR(word0->text[i])) {
3283 --lrCount;
3284 }
3285 }
3286 }
3287 }
3288 }
3289 primaryLR = lrCount >= 0;
3290
3291 #if 0 // for debugging
3292 printf("*** direction ***\n");
3293 printf("lrCount = %d\n", lrCount);
3294 printf("primaryLR = %d\n", primaryLR);
3295 #endif
3296
3297 //----- column assignment
3298
3299 // sort blocks into xy order for column assignment
3300 if (blocks)
3301 gfree(blocks);
3302 if (physLayout && fixedPitch) {
3303
3304 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3305 int i;
3306 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3307 blocks[i] = blk;
3308 col1 = 0; // make gcc happy
3309 switch (primaryRot) {
3310 case 0:
3311 col1 = (int)(blk->xMin / fixedPitch + 0.5);
3312 break;
3313 case 1:
3314 col1 = (int)(blk->yMin / fixedPitch + 0.5);
3315 break;
3316 case 2:
3317 col1 = (int)((pageWidth - blk->xMax) / fixedPitch + 0.5);
3318 break;
3319 case 3:
3320 col1 = (int)((pageHeight - blk->yMax) / fixedPitch + 0.5);
3321 break;
3322 }
3323 blk->col = col1;
3324 for (line = blk->lines; line; line = line->next) {
3325 for (j = 0; j <= line->len; ++j) {
3326 line->col[j] += col1;
3327 }
3328 }
3329 }
3330
3331 } else {
3332
3333 // sort blocks into xy order for column assignment
3334 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
3335 int i;
3336 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
3337 blocks[i] = blk;
3338 }
3339 if (blocks) {
3340 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
3341 }
3342
3343 // column assignment
3344 for (i = 0; i < nBlocks; ++i) {
3345 blk0 = blocks[i];
3346 col1 = 0;
3347 for (j = 0; j < i; ++j) {
3348 blk1 = blocks[j];
3349 col2 = 0; // make gcc happy
3350 switch (primaryRot) {
3351 case 0:
3352 if (blk0->xMin > blk1->xMax) {
3353 col2 = blk1->col + blk1->nColumns + 3;
3354 } else if (blk1->xMax == blk1->xMin) {
3355 col2 = blk1->col;
3356 } else {
3357 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) / (blk1->xMax - blk1->xMin)) * blk1->nColumns);
3358 }
3359 break;
3360 case 1:
3361 if (blk0->yMin > blk1->yMax) {
3362 col2 = blk1->col + blk1->nColumns + 3;
3363 } else if (blk1->yMax == blk1->yMin) {
3364 col2 = blk1->col;
3365 } else {
3366 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) / (blk1->yMax - blk1->yMin)) * blk1->nColumns);
3367 }
3368 break;
3369 case 2:
3370 if (blk0->xMax < blk1->xMin) {
3371 col2 = blk1->col + blk1->nColumns + 3;
3372 } else if (blk1->xMin == blk1->xMax) {
3373 col2 = blk1->col;
3374 } else {
3375 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) / (blk1->xMin - blk1->xMax)) * blk1->nColumns);
3376 }
3377 break;
3378 case 3:
3379 if (blk0->yMax < blk1->yMin) {
3380 col2 = blk1->col + blk1->nColumns + 3;
3381 } else if (blk1->yMin == blk1->yMax) {
3382 col2 = blk1->col;
3383 } else {
3384 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) / (blk1->yMin - blk1->yMax)) * blk1->nColumns);
3385 }
3386 break;
3387 }
3388 if (col2 > col1) {
3389 col1 = col2;
3390 }
3391 }
3392 blk0->col = col1;
3393 for (line = blk0->lines; line; line = line->next) {
3394 for (j = 0; j <= line->len; ++j) {
3395 line->col[j] += col1;
3396 }
3397 }
3398 }
3399 }
3400
3401 #if 0 // for debugging
3402 printf("*** blocks, after column assignment ***\n");
3403 for (blk = blkList; blk; blk = blk->next) {
3404 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
3405 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
3406 blk->nColumns);
3407 for (line = blk->lines; line; line = line->next) {
3408 printf(" line: col[0]=%d\n", line->col[0]);
3409 for (word0 = line->words; word0; word0 = word0->next) {
3410 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3411 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3412 word0->base, word0->fontSize, word0->spaceAfter);
3413 for (i = 0; i < word0->len; ++i) {
3414 fputc(word0->text[i] & 0xff, stdout);
3415 }
3416 printf("'\n");
3417 }
3418 }
3419 }
3420 printf("\n");
3421 #endif
3422
3423 //----- reading order sort
3424
3425 // compute space on left and right sides of each block
3426 for (int i = 0; i < nBlocks; ++i) {
3427 blk0 = blocks[i];
3428 for (j = 0; j < nBlocks; ++j) {
3429 blk1 = blocks[j];
3430 if (blk1 != blk0) {
3431 blk0->updatePriMinMax(blk1);
3432 }
3433 }
3434 }
3435
3436 #if 0 // for debugging
3437 printf("PAGE\n");
3438 #endif
3439
3440 int sortPos = 0;
3441 bool *visited = (bool *)gmallocn(nBlocks, sizeof(bool));
3442 for (int i = 0; i < nBlocks; i++) {
3443 visited[i] = false;
3444 }
3445
3446 double bxMin0, byMin0, bxMin1, byMin1;
3447 int numTables = 0;
3448 int tableId = -1;
3449 int correspondenceX, correspondenceY;
3450 double xCentre1, yCentre1, xCentre2, yCentre2;
3451 double xCentre3, yCentre3, xCentre4, yCentre4;
3452 double deltaX, deltaY;
3453 TextBlock *fblk2 = nullptr, *fblk3 = nullptr, *fblk4 = nullptr;
3454
3455 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3456 blk1->ExMin = blk1->xMin;
3457 blk1->ExMax = blk1->xMax;
3458 blk1->EyMin = blk1->yMin;
3459 blk1->EyMax = blk1->yMax;
3460
3461 bxMin0 = DBL_MAX;
3462 byMin0 = DBL_MAX;
3463 bxMin1 = DBL_MAX;
3464 byMin1 = DBL_MAX;
3465
3466 fblk2 = nullptr;
3467 fblk3 = nullptr;
3468 fblk4 = nullptr;
3469
3470 /* find fblk2, fblk3 and fblk4 so that
3471 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3472 * fblk3 is under blk1 and overlap with blk1 in x axis
3473 * fblk4 is under blk1 and on the right of blk1
3474 * and they are closest to blk1
3475 */
3476 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3477 if (blk2 != blk1) {
3478 if (blk2->yMin <= blk1->yMax && blk2->yMax >= blk1->yMin && blk2->xMin > blk1->xMax && blk2->xMin < bxMin0) {
3479 bxMin0 = blk2->xMin;
3480 fblk2 = blk2;
3481 } else if (blk2->xMin <= blk1->xMax && blk2->xMax >= blk1->xMin && blk2->yMin > blk1->yMax && blk2->yMin < byMin0) {
3482 byMin0 = blk2->yMin;
3483 fblk3 = blk2;
3484 } else if (blk2->xMin > blk1->xMax && blk2->xMin < bxMin1 && blk2->yMin > blk1->yMax && blk2->yMin < byMin1) {
3485 bxMin1 = blk2->xMin;
3486 byMin1 = blk2->yMin;
3487 fblk4 = blk2;
3488 }
3489 }
3490 }
3491
3492 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3493 * fblk2 can not overlap with fblk3 in x and y
3494 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3495 */
3496 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3497 if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) || (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) || (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin)
3498 || (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin))
3499 || !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin && fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3500 fblk2 = nullptr;
3501 fblk3 = nullptr;
3502 fblk4 = nullptr;
3503 }
3504 }
3505
3506 // if we found any then look whether they form a table
3507 if (fblk2 != nullptr && fblk3 != nullptr && fblk4 != nullptr) {
3508 tableId = -1;
3509 correspondenceX = 0;
3510 correspondenceY = 0;
3511 deltaX = 0.0;
3512 deltaY = 0.0;
3513
3514 if (blk1->lines && blk1->lines->words)
3515 deltaX = blk1->lines->words->getFontSize();
3516 if (fblk2->lines && fblk2->lines->words)
3517 deltaX = deltaX < fblk2->lines->words->getFontSize() ? deltaX : fblk2->lines->words->getFontSize();
3518 if (fblk3->lines && fblk3->lines->words)
3519 deltaX = deltaX < fblk3->lines->words->getFontSize() ? deltaX : fblk3->lines->words->getFontSize();
3520 if (fblk4->lines && fblk4->lines->words)
3521 deltaX = deltaX < fblk4->lines->words->getFontSize() ? deltaX : fblk4->lines->words->getFontSize();
3522
3523 deltaY = deltaX;
3524
3525 deltaX *= minColSpacing1;
3526 deltaY *= maxIntraLineDelta;
3527
3528 xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3529 yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3530 xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3531 yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3532 xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3533 yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3534 xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3535 yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3536
3537 // are blocks centrally aligned in x ?
3538 if (fabs(xCentre1 - xCentre3) <= deltaX && fabs(xCentre2 - xCentre4) <= deltaX)
3539 correspondenceX++;
3540
3541 // are blocks centrally aligned in y ?
3542 if (fabs(yCentre1 - yCentre2) <= deltaY && fabs(yCentre3 - yCentre4) <= deltaY)
3543 correspondenceY++;
3544
3545 // are blocks aligned to the left ?
3546 if (fabs(blk1->xMin - fblk3->xMin) <= deltaX && fabs(fblk2->xMin - fblk4->xMin) <= deltaX)
3547 correspondenceX++;
3548
3549 // are blocks aligned to the right ?
3550 if (fabs(blk1->xMax - fblk3->xMax) <= deltaX && fabs(fblk2->xMax - fblk4->xMax) <= deltaX)
3551 correspondenceX++;
3552
3553 // are blocks aligned to the top ?
3554 if (fabs(blk1->yMin - fblk2->yMin) <= deltaY && fabs(fblk3->yMin - fblk4->yMin) <= deltaY)
3555 correspondenceY++;
3556
3557 // are blocks aligned to the bottom ?
3558 if (fabs(blk1->yMax - fblk2->yMax) <= deltaY && fabs(fblk3->yMax - fblk4->yMax) <= deltaY)
3559 correspondenceY++;
3560
3561 // are blocks aligned in x and y ?
3562 if (correspondenceX > 0 && correspondenceY > 0) {
3563
3564 // find maximal tableId
3565 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3566 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3567 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3568 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3569
3570 // if the tableId is -1, then we found new table
3571 if (tableId < 0) {
3572 tableId = numTables;
3573 numTables++;
3574 }
3575
3576 blk1->tableId = tableId;
3577 fblk2->tableId = tableId;
3578 fblk3->tableId = tableId;
3579 fblk4->tableId = tableId;
3580 }
3581 }
3582 }
3583
3584 /* set extended bounding boxes of all table entries
3585 * so that they contain whole table
3586 * (we need to process whole table size when comparing it
3587 * with regular text blocks)
3588 */
3589 PDFRectangle *envelopes = new PDFRectangle[numTables];
3590 TextBlock **ending_blocks = new TextBlock *[numTables];
3591
3592 for (int i = 0; i < numTables; i++) {
3593 envelopes[i].x1 = DBL_MAX;
3594 envelopes[i].x2 = DBL_MIN;
3595 envelopes[i].y1 = DBL_MAX;
3596 envelopes[i].y2 = DBL_MIN;
3597 }
3598
3599 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3600 if (blk1->tableId >= 0) {
3601 if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3602 envelopes[blk1->tableId].x1 = blk1->ExMin;
3603 if (!blk1->page->primaryLR)
3604 ending_blocks[blk1->tableId] = blk1;
3605 }
3606
3607 if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3608 envelopes[blk1->tableId].x2 = blk1->ExMax;
3609 if (blk1->page->primaryLR)
3610 ending_blocks[blk1->tableId] = blk1;
3611 }
3612
3613 envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ? blk1->EyMin : envelopes[blk1->tableId].y1;
3614 envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ? blk1->EyMax : envelopes[blk1->tableId].y2;
3615 }
3616 }
3617
3618 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3619 if (blk1->tableId >= 0 && blk1->xMin <= ending_blocks[blk1->tableId]->xMax && blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3620 blk1->tableEnd = true;
3621 }
3622 }
3623
3624 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3625 if (blk1->tableId >= 0) {
3626 blk1->ExMin = envelopes[blk1->tableId].x1;
3627 blk1->ExMax = envelopes[blk1->tableId].x2;
3628 blk1->EyMin = envelopes[blk1->tableId].y1;
3629 blk1->EyMax = envelopes[blk1->tableId].y2;
3630 }
3631 }
3632 delete[] envelopes;
3633 delete[] ending_blocks;
3634
3635 /* set extended bounding boxes of all other blocks
3636 * so that they extend in x without hitting neighbours
3637 */
3638 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3639 if (!(blk1->tableId >= 0)) {
3640 double xMax = DBL_MAX;
3641 double xMin = DBL_MIN;
3642
3643 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3644 if (blk2 == blk1)
3645 continue;
3646
3647 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3648 if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3649 xMax = blk2->xMin;
3650
3651 if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3652 xMin = blk2->xMax;
3653 }
3654 }
3655
3656 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3657 if (blk2 == blk1)
3658 continue;
3659
3660 if (blk2->xMax > blk1->ExMax && blk2->xMax <= xMax && blk2->yMin >= blk1->yMax) {
3661 blk1->ExMax = blk2->xMax;
3662 }
3663
3664 if (blk2->xMin < blk1->ExMin && blk2->xMin >= xMin && blk2->yMin >= blk1->yMax)
3665 blk1->ExMin = blk2->xMin;
3666 }
3667 }
3668 }
3669
3670 int i = -1;
3671 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3672 i++;
3673 sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3674 }
3675 if (visited) {
3676 gfree(visited);
3677 }
3678
3679 #if 0 // for debugging
3680 printf("*** blocks, after ro sort ***\n");
3681 for (i = 0; i < nBlocks; ++i) {
3682 blk = blocks[i];
3683 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3684 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3685 blk->priMin, blk->priMax);
3686 for (line = blk->lines; line; line = line->next) {
3687 printf(" line:\n");
3688 for (word0 = line->words; word0; word0 = word0->next) {
3689 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3690 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3691 word0->base, word0->fontSize, word0->spaceAfter);
3692 for (j = 0; j < word0->len; ++j) {
3693 fputc(word0->text[j] & 0xff, stdout);
3694 }
3695 printf("'\n");
3696 }
3697 }
3698 }
3699 printf("\n");
3700 fflush(stdout);
3701 #endif
3702
3703 // build the flows
3704 //~ this needs to be adjusted for writing mode (vertical text)
3705 //~ this also needs to account for right-to-left column ordering
3706 while (flows) {
3707 flow = flows;
3708 flows = flows->next;
3709 delete flow;
3710 }
3711 flow = nullptr;
3712 flows = lastFlow = nullptr;
3713 // assume blocks are already in reading order,
3714 // and construct flows accordingly.
3715 for (i = 0; i < nBlocks; i++) {
3716 blk = blocks[i];
3717 blk->next = nullptr;
3718 if (flow) {
3719 blk1 = blocks[i - 1];
3720 blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3721 if (blk1->secondaryDelta(blk) <= blkSpace && blk->isBelow(blk1) && flow->blockFits(blk, blk1)) {
3722 flow->addBlock(blk);
3723 continue;
3724 }
3725 }
3726 flow = new TextFlow(this, blk);
3727 if (lastFlow) {
3728 lastFlow->next = flow;
3729 } else {
3730 flows = flow;
3731 }
3732 lastFlow = flow;
3733 }
3734
3735 #if 0 // for debugging
3736 printf("*** flows ***\n");
3737 for (flow = flows; flow; flow = flow->next) {
3738 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3739 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3740 flow->priMin, flow->priMax);
3741 for (blk = flow->blocks; blk; blk = blk->next) {
3742 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3743 blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3744 blk->priMin, blk->priMax);
3745 for (line = blk->lines; line; line = line->next) {
3746 printf(" line:\n");
3747 for (word0 = line->words; word0; word0 = word0->next) {
3748 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3749 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3750 word0->base, word0->fontSize, word0->spaceAfter);
3751 for (i = 0; i < word0->len; ++i) {
3752 fputc(word0->text[i] & 0xff, stdout);
3753 }
3754 printf("'\n");
3755 }
3756 }
3757 }
3758 }
3759 printf("\n");
3760 #endif
3761 }
3762
adjustRotation(TextLine * line,int start,int end,double * xMin,double * xMax,double * yMin,double * yMax)3763 void TextPage::adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax)
3764 {
3765 switch (line->rot) {
3766 case 0:
3767 *xMin = line->edge[start];
3768 *xMax = line->edge[end];
3769 *yMin = line->yMin;
3770 *yMax = line->yMax;
3771 break;
3772 case 1:
3773 *xMin = line->xMin;
3774 *xMax = line->xMax;
3775 *yMin = line->edge[start];
3776 *yMax = line->edge[end];
3777 break;
3778 case 2:
3779 *xMin = line->edge[end];
3780 *xMax = line->edge[start];
3781 *yMin = line->yMin;
3782 *yMax = line->yMax;
3783 break;
3784 case 3:
3785 *xMin = line->xMin;
3786 *xMax = line->xMax;
3787 *yMin = line->edge[end];
3788 *yMax = line->edge[start];
3789 break;
3790 }
3791 }
3792
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3793 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax)
3794 {
3795 return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, false, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
3796 }
3797
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool ignoreDiacritics,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3798 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
3799 double *yMax)
3800 {
3801 return findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, ignoreDiacritics, false, backward, wholeWord, xMin, yMin, xMax, yMax, nullptr, nullptr);
3802 }
3803
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool ignoreDiacritics,bool matchAcrossLines,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax,PDFRectangle * continueMatch,bool * ignoredHyphen)3804 bool TextPage::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin,
3805 double *yMin, double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen)
3806 {
3807 TextBlock *blk;
3808 TextLine *line;
3809 Unicode *s2, *txt, *reordered;
3810 Unicode *p;
3811 Unicode *nextline;
3812 int nextline_len;
3813 bool nextlineAfterHyphen = false;
3814 int txtSize, m, i, j, k;
3815 double xStart, yStart, xStop, yStop;
3816 double xMin0, yMin0, xMax0, yMax0;
3817 double xMin1, yMin1, xMax1, yMax1;
3818 double xMin2, yMin2, xMax2, yMax2;
3819 bool found;
3820
3821 if (len == 0) {
3822 return false;
3823 }
3824
3825 if (rawOrder) {
3826 return false;
3827 }
3828
3829 if (matchAcrossLines && backward) {
3830 // matchAcrossLines is unimplemented for backward search
3831 matchAcrossLines = false;
3832 }
3833
3834 // handle right-to-left text
3835 reordered = (Unicode *)gmallocn(len, sizeof(Unicode));
3836 reorderText(s, len, nullptr, primaryLR, nullptr, reordered);
3837
3838 // normalize the search string
3839 s2 = unicodeNormalizeNFKC(reordered, len, &len, nullptr);
3840
3841 // if search string is not pure ascii then don't
3842 // use ignoreDiacritics (as they won't match)
3843 if (!caseSensitive) {
3844 // convert the search string to uppercase
3845 for (i = 0; i < len; ++i) {
3846 s2[i] = unicodeToUpper(s2[i]);
3847 if (ignoreDiacritics && !isAscii7(s2[i]))
3848 ignoreDiacritics = false;
3849 }
3850 } else if (ignoreDiacritics) {
3851 for (i = 0; i < len; ++i) {
3852 if (!isAscii7(s2[i])) {
3853 ignoreDiacritics = false;
3854 break;
3855 }
3856 }
3857 }
3858
3859 txt = nullptr;
3860 txtSize = 0;
3861
3862 xStart = yStart = xStop = yStop = 0;
3863 if (startAtLast && haveLastFind) {
3864 xStart = lastFindXMin;
3865 yStart = lastFindYMin;
3866 } else if (!startAtTop) {
3867 xStart = *xMin;
3868 yStart = *yMin;
3869 }
3870 if (stopAtLast && haveLastFind) {
3871 xStop = lastFindXMin;
3872 yStop = lastFindYMin;
3873 } else if (!stopAtBottom) {
3874 xStop = *xMax;
3875 yStop = *yMax;
3876 }
3877
3878 found = false;
3879 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3880 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3881
3882 for (i = backward ? nBlocks - 1 : 0; backward ? i >= 0 : i < nBlocks; i += backward ? -1 : 1) {
3883 blk = blocks[i];
3884
3885 // check: is the block above the top limit?
3886 // (this only works if the page's primary rotation is zero --
3887 // otherwise the blocks won't be sorted in the useful order)
3888 if (!startAtTop && primaryRot == 0 && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3889 continue;
3890 }
3891
3892 // check: is the block below the bottom limit?
3893 // (this only works if the page's primary rotation is zero --
3894 // otherwise the blocks won't be sorted in the useful order)
3895 if (!stopAtBottom && primaryRot == 0 && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3896 break;
3897 }
3898
3899 for (line = blk->lines; line; line = line->next) {
3900
3901 // check: is the line above the top limit?
3902 // (this only works if the page's primary rotation is zero --
3903 // otherwise the lines won't be sorted in the useful order)
3904 if (!startAtTop && primaryRot == 0 && (backward ? line->yMin > yStart : line->yMin < yStart)) {
3905 continue;
3906 }
3907
3908 // check: is the line below the bottom limit?
3909 // (this only works if the page's primary rotation is zero --
3910 // otherwise the lines won't be sorted in the useful order)
3911 if (!stopAtBottom && primaryRot == 0 && (backward ? line->yMin < yStop : line->yMin > yStop)) {
3912 continue;
3913 }
3914
3915 if (!line->normalized)
3916 line->normalized = unicodeNormalizeNFKC(line->text, line->len, &line->normalized_len, &line->normalized_idx, true);
3917
3918 if (matchAcrossLines && line->next && !line->next->normalized)
3919 line->next->normalized = unicodeNormalizeNFKC(line->next->text, line->next->len, &line->next->normalized_len, &line->next->normalized_idx, true);
3920 nextline = nullptr;
3921 nextline_len = 0;
3922
3923 // convert the line to uppercase
3924 m = line->normalized_len;
3925
3926 if (ignoreDiacritics) {
3927 if (!line->ascii_translation)
3928 unicodeToAscii7(line->normalized, line->normalized_len, &line->ascii_translation, &line->ascii_len, line->normalized_idx, &line->ascii_idx);
3929 if (line->ascii_len)
3930 m = line->ascii_len;
3931 else
3932 ignoreDiacritics = false;
3933
3934 if (matchAcrossLines && line->next && !line->next->ascii_translation)
3935 unicodeToAscii7(line->next->normalized, line->next->normalized_len, &line->next->ascii_translation, &line->next->ascii_len, line->next->normalized_idx, &line->next->ascii_idx);
3936 }
3937 if (!caseSensitive) {
3938 if (m > txtSize) {
3939 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3940 txtSize = m;
3941 }
3942 for (k = 0; k < m; ++k) {
3943 if (ignoreDiacritics)
3944 txt[k] = unicodeToUpper(line->ascii_translation[k]);
3945 else
3946 txt[k] = unicodeToUpper(line->normalized[k]);
3947 }
3948 if (matchAcrossLines && line->next) {
3949 nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
3950 nextline = (Unicode *)gmallocn(nextline_len, sizeof(Unicode));
3951 for (k = 0; k < nextline_len; ++k) {
3952 nextline[k] = ignoreDiacritics ? unicodeToUpper(line->next->ascii_translation[k]) : unicodeToUpper(line->next->normalized[k]);
3953 }
3954 }
3955 } else {
3956 if (ignoreDiacritics)
3957 txt = line->ascii_translation;
3958 else
3959 txt = line->normalized;
3960
3961 if (matchAcrossLines && line->next) {
3962 nextline_len = ignoreDiacritics ? line->next->ascii_len : line->next->normalized_len;
3963 nextline = ignoreDiacritics ? line->next->ascii_translation : line->next->normalized;
3964 }
3965 }
3966
3967 // search each position in this line
3968 j = backward ? m - len : 0;
3969 p = txt + j;
3970 while (backward ? j >= 0 : j <= m - (nextline ? 1 : len)) {
3971 bool wholeWordStartIsOk, wholeWordEndIsOk;
3972 if (wholeWord) {
3973 wholeWordStartIsOk = j == 0 || !unicodeTypeAlphaNum(txt[j - 1]);
3974 if (nextline)
3975 wholeWordEndIsOk = true; // word end may be in next line, so we'll check it later
3976 else
3977 wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
3978 }
3979 if (!wholeWord || (wholeWordStartIsOk && wholeWordEndIsOk)) {
3980 int n = 0;
3981 bool spaceConsumedByNewline = false;
3982 bool found_it;
3983
3984 // compare the strings
3985 for (k = 0; k < len; ++k) {
3986 bool last_char_of_line = j + k == m - 1;
3987 bool last_char_of_search_term = k == len - 1;
3988
3989 if (p[k] != s2[k] || (nextline && last_char_of_line && !last_char_of_search_term)) {
3990 // now check if the comparison failed at the end-of-line hyphen,
3991 // and if so, keep on comparing at the next line
3992 nextlineAfterHyphen = false;
3993
3994 if (s2[k] == p[k]) {
3995 if (p[k] != (Unicode)'-' && !UnicodeIsWhitespace(s2[k + 1])) {
3996 break;
3997 }
3998 k++;
3999 } else if (p[k] != (Unicode)'-' || UnicodeIsWhitespace(s2[k]))
4000 break;
4001 else
4002 nextlineAfterHyphen = true;
4003
4004 for (; n < nextline_len && k < len; ++k, ++n) {
4005 if (nextline[n] != s2[k]) {
4006 if (!spaceConsumedByNewline && !n && UnicodeIsWhitespace(s2[k])) {
4007 n = -1;
4008 spaceConsumedByNewline = true;
4009 continue;
4010 }
4011 break;
4012 }
4013 }
4014 break;
4015 }
4016 }
4017
4018 found_it = k == len;
4019 if (found_it && nextline && wholeWord) { // check word end for nextline case
4020 if (n) // Match ended at next line
4021 wholeWordEndIsOk = n == nextline_len || !unicodeTypeAlphaNum(nextline[n]);
4022 else // Match ended on same line
4023 wholeWordEndIsOk = j + len == m || !unicodeTypeAlphaNum(txt[j + len]);
4024
4025 if (!wholeWordEndIsOk)
4026 found_it = false;
4027 }
4028 // found it
4029 if (found_it) {
4030 bool nextLineMatch = (bool)n;
4031 if (spaceConsumedByNewline)
4032 k--;
4033 // where s2 matches a subsequence of a compatibility equivalence
4034 // decomposition, highlight the entire glyph, since we don't know
4035 // the internal layout of subglyph components
4036 int normStart, normAfterEnd;
4037 if (ignoreDiacritics) {
4038 normStart = line->ascii_idx[j];
4039 if (nextline)
4040 normAfterEnd = line->ascii_idx[j + k - n];
4041 else
4042 normAfterEnd = line->ascii_idx[j + len - 1] + 1;
4043 } else {
4044 normStart = line->normalized_idx[j];
4045 if (nextline)
4046 normAfterEnd = line->normalized_idx[j + k - n];
4047 else
4048 normAfterEnd = line->normalized_idx[j + len - 1] + 1;
4049 }
4050
4051 adjustRotation(line, normStart, normAfterEnd, &xMin1, &xMax1, &yMin1, &yMax1);
4052
4053 if (backward) {
4054 if ((startAtTop || yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) && (stopAtBottom || yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
4055 if (!found || yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
4056 xMin0 = xMin1;
4057 xMax0 = xMax1;
4058 yMin0 = yMin1;
4059 yMax0 = yMax1;
4060 found = true;
4061 }
4062 }
4063 } else {
4064 if ((startAtTop || yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) && (stopAtBottom || yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
4065 if (!found || yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
4066 xMin0 = xMin1;
4067 xMax0 = xMax1;
4068 yMin0 = yMin1;
4069 yMax0 = yMax1;
4070 found = true;
4071 if (nextLineMatch) { // set the out parameters
4072 if (ignoredHyphen)
4073 *ignoredHyphen = nextlineAfterHyphen;
4074
4075 if (continueMatch) {
4076 adjustRotation(line->next, 0, n, &xMin2, &xMax2, &yMin2, &yMax2);
4077 continueMatch->x1 = xMin2;
4078 continueMatch->y1 = yMax2;
4079 continueMatch->x2 = xMax2;
4080 continueMatch->y2 = yMin2;
4081 }
4082 }
4083 }
4084 }
4085 }
4086 }
4087 }
4088 if (backward) {
4089 --j;
4090 --p;
4091 } else {
4092 ++j;
4093 ++p;
4094 }
4095 }
4096
4097 if (nextline && nextline != line->next->ascii_translation && nextline != line->next->normalized) {
4098 gfree(nextline);
4099 }
4100 }
4101 }
4102
4103 gfree(s2);
4104 gfree(reordered);
4105 if (!caseSensitive) {
4106 gfree(txt);
4107 }
4108
4109 if (found) {
4110 *xMin = xMin0;
4111 *xMax = xMax0;
4112 *yMin = yMin0;
4113 *yMax = yMax0;
4114 lastFindXMin = xMin0;
4115 lastFindYMin = yMin0;
4116 haveLastFind = true;
4117 return true;
4118 }
4119
4120 return false;
4121 }
4122
getText(double xMin,double yMin,double xMax,double yMax,EndOfLineKind textEOL) const4123 GooString *TextPage::getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const
4124 {
4125 GooString *s;
4126 const UnicodeMap *uMap;
4127 TextBlock *blk;
4128 TextLine *line;
4129 TextLineFrag *frags;
4130 int nFrags, fragsSize;
4131 TextLineFrag *frag;
4132 char space[8], eol[16];
4133 int spaceLen, eolLen;
4134 int lastRot;
4135 double x, y, delta;
4136 int col, idx0, idx1, i, j;
4137 bool multiLine, oneRot;
4138
4139 s = new GooString();
4140
4141 // get the output encoding
4142 if (!(uMap = globalParams->getTextEncoding())) {
4143 return s;
4144 }
4145
4146 if (rawOrder) {
4147 TextWord *word;
4148 char mbc[16];
4149 int mbc_len;
4150
4151 for (word = rawWords; word && word <= rawLastWord; word = word->next) {
4152 for (j = 0; j < word->getLength(); ++j) {
4153 double gXMin, gXMax, gYMin, gYMax;
4154 word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
4155 if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax) {
4156 mbc_len = uMap->mapUnicode(*(word->getChar(j)), mbc, sizeof(mbc));
4157 s->append(mbc, mbc_len);
4158 }
4159 }
4160 }
4161 return s;
4162 }
4163
4164 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4165 eolLen = 0; // make gcc happy
4166 switch (textEOL) {
4167 case eolUnix:
4168 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4169 break;
4170 case eolDOS:
4171 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4172 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4173 break;
4174 case eolMac:
4175 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4176 break;
4177 }
4178
4179 //~ writing mode (horiz/vert)
4180
4181 // collect the line fragments that are in the rectangle
4182 fragsSize = 256;
4183 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4184 nFrags = 0;
4185 lastRot = -1;
4186 oneRot = true;
4187 for (i = 0; i < nBlocks; ++i) {
4188 blk = blocks[i];
4189 if (xMin < blk->xMax && blk->xMin < xMax && yMin < blk->yMax && blk->yMin < yMax) {
4190 for (line = blk->lines; line; line = line->next) {
4191 if (xMin < line->xMax && line->xMin < xMax && yMin < line->yMax && line->yMin < yMax) {
4192 idx0 = idx1 = -1;
4193 switch (line->rot) {
4194 case 0:
4195 y = 0.5 * (line->yMin + line->yMax);
4196 if (yMin < y && y < yMax) {
4197 j = 0;
4198 while (j < line->len) {
4199 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4200 idx0 = j;
4201 break;
4202 }
4203 ++j;
4204 }
4205 j = line->len - 1;
4206 while (j >= 0) {
4207 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4208 idx1 = j;
4209 break;
4210 }
4211 --j;
4212 }
4213 }
4214 break;
4215 case 1:
4216 x = 0.5 * (line->xMin + line->xMax);
4217 if (xMin < x && x < xMax) {
4218 j = 0;
4219 while (j < line->len) {
4220 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4221 idx0 = j;
4222 break;
4223 }
4224 ++j;
4225 }
4226 j = line->len - 1;
4227 while (j >= 0) {
4228 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4229 idx1 = j;
4230 break;
4231 }
4232 --j;
4233 }
4234 }
4235 break;
4236 case 2:
4237 y = 0.5 * (line->yMin + line->yMax);
4238 if (yMin < y && y < yMax) {
4239 j = 0;
4240 while (j < line->len) {
4241 if (0.5 * (line->edge[j] + line->edge[j + 1]) < xMax) {
4242 idx0 = j;
4243 break;
4244 }
4245 ++j;
4246 }
4247 j = line->len - 1;
4248 while (j >= 0) {
4249 if (0.5 * (line->edge[j] + line->edge[j + 1]) > xMin) {
4250 idx1 = j;
4251 break;
4252 }
4253 --j;
4254 }
4255 }
4256 break;
4257 case 3:
4258 x = 0.5 * (line->xMin + line->xMax);
4259 if (xMin < x && x < xMax) {
4260 j = 0;
4261 while (j < line->len) {
4262 if (0.5 * (line->edge[j] + line->edge[j + 1]) < yMax) {
4263 idx0 = j;
4264 break;
4265 }
4266 ++j;
4267 }
4268 j = line->len - 1;
4269 while (j >= 0) {
4270 if (0.5 * (line->edge[j] + line->edge[j + 1]) > yMin) {
4271 idx1 = j;
4272 break;
4273 }
4274 --j;
4275 }
4276 }
4277 break;
4278 }
4279 if (idx0 >= 0 && idx1 >= 0) {
4280 if (nFrags == fragsSize) {
4281 fragsSize *= 2;
4282 frags = (TextLineFrag *)greallocn(frags, fragsSize, sizeof(TextLineFrag));
4283 }
4284 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
4285 ++nFrags;
4286 if (lastRot >= 0 && line->rot != lastRot) {
4287 oneRot = false;
4288 }
4289 lastRot = line->rot;
4290 }
4291 }
4292 }
4293 }
4294 }
4295
4296 // sort the fragments and generate the string
4297 if (nFrags > 0) {
4298
4299 for (i = 0; i < nFrags; ++i) {
4300 frags[i].computeCoords(oneRot);
4301 }
4302 assignColumns(frags, nFrags, oneRot);
4303
4304 // if all lines in the region have the same rotation, use it;
4305 // otherwise, use the page's primary rotation
4306 if (oneRot) {
4307 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXLineRot);
4308 } else {
4309 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
4310 }
4311 i = 0;
4312 while (i < nFrags) {
4313 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4314 for (j = i + 1; j < nFrags && fabs(frags[j].base - frags[i].base) < delta; ++j)
4315 ;
4316 qsort(frags + i, j - i, sizeof(TextLineFrag), oneRot ? &TextLineFrag::cmpXYColumnLineRot : &TextLineFrag::cmpXYColumnPrimaryRot);
4317 i = j;
4318 }
4319
4320 col = 0;
4321 multiLine = false;
4322 for (i = 0; i < nFrags; ++i) {
4323 frag = &frags[i];
4324
4325 // insert a return
4326 if (frag->col < col || (i > 0 && fabs(frag->base - frags[i - 1].base) > maxIntraLineDelta * frags[i - 1].line->words->fontSize)) {
4327 s->append(eol, eolLen);
4328 col = 0;
4329 multiLine = true;
4330 }
4331
4332 // column alignment
4333 for (; col < frag->col; ++col) {
4334 s->append(space, spaceLen);
4335 }
4336
4337 // get the fragment text
4338 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4339 }
4340
4341 if (multiLine) {
4342 s->append(eol, eolLen);
4343 }
4344 }
4345
4346 gfree(frags);
4347
4348 return s;
4349 }
4350
4351 class TextSelectionVisitor
4352 {
4353 public:
4354 explicit TextSelectionVisitor(TextPage *page);
4355 virtual ~TextSelectionVisitor();
4356 TextSelectionVisitor(const TextSelectionVisitor &) = delete;
4357 TextSelectionVisitor &operator=(const TextSelectionVisitor &) = delete;
4358 virtual void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) = 0;
4359 virtual void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) = 0;
4360 virtual void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) = 0;
4361
4362 protected:
4363 TextPage *page;
4364 };
4365
TextSelectionVisitor(TextPage * p)4366 TextSelectionVisitor::TextSelectionVisitor(TextPage *p) : page(p) { }
4367
4368 TextSelectionVisitor::~TextSelectionVisitor() = default;
4369
4370 class TextSelectionDumper : public TextSelectionVisitor
4371 {
4372 public:
4373 explicit TextSelectionDumper(TextPage *page);
4374 ~TextSelectionDumper() override;
4375
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4376 void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4377 void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4378 void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4379 void endPage();
4380
4381 GooString *getText();
4382 std::vector<TextWordSelection *> **takeWordList(int *nLines);
4383
4384 private:
4385 void startLine();
4386 void finishLine();
4387
4388 std::vector<TextWordSelection *> **lines;
4389 int nLines, linesSize;
4390 std::vector<TextWordSelection *> *words;
4391 int tableId;
4392 TextBlock *currentBlock;
4393 };
4394
TextSelectionDumper(TextPage * p)4395 TextSelectionDumper::TextSelectionDumper(TextPage *p) : TextSelectionVisitor(p)
4396 {
4397 linesSize = 256;
4398 lines = (std::vector<TextWordSelection *> **)gmallocn(linesSize, sizeof(std::vector<TextWordSelection *> *));
4399 nLines = 0;
4400
4401 tableId = -1;
4402 currentBlock = nullptr;
4403 words = nullptr;
4404 }
4405
~TextSelectionDumper()4406 TextSelectionDumper::~TextSelectionDumper()
4407 {
4408 for (int i = 0; i < nLines; i++) {
4409 for (auto entry : *(lines[i])) {
4410 delete entry;
4411 }
4412 delete lines[i];
4413 }
4414 gfree(lines);
4415 }
4416
startLine()4417 void TextSelectionDumper::startLine()
4418 {
4419 finishLine();
4420 words = new std::vector<TextWordSelection *>();
4421 }
4422
finishLine()4423 void TextSelectionDumper::finishLine()
4424 {
4425 if (nLines == linesSize) {
4426 linesSize *= 2;
4427 lines = (std::vector<TextWordSelection *> **)grealloc(lines, linesSize * sizeof(std::vector<TextWordSelection *> *));
4428 }
4429
4430 if (words && words->size() > 0) {
4431 // Reverse word order for RTL text. Fixes #53 for glib backend (Evince)
4432 if (!page->primaryLR)
4433 std::reverse(words->begin(), words->end());
4434
4435 lines[nLines++] = words;
4436 } else if (words)
4437 delete words;
4438 words = nullptr;
4439 }
4440
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4441 void TextSelectionDumper::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4442 {
4443 TextLineFrag frag;
4444
4445 frag.init(line, edge_begin, edge_end - edge_begin);
4446
4447 if (tableId >= 0 && frag.line->blk->tableId < 0) {
4448 finishLine();
4449
4450 tableId = -1;
4451 currentBlock = nullptr;
4452 }
4453
4454 if (frag.line->blk->tableId >= 0) { // a table
4455 if (tableId == -1) {
4456 tableId = frag.line->blk->tableId;
4457 currentBlock = frag.line->blk;
4458 }
4459
4460 if (currentBlock == frag.line->blk) { // the same block
4461 startLine();
4462 } else { // another block
4463 if (currentBlock->tableEnd) { // previous block ended its row
4464 startLine();
4465 }
4466 currentBlock = frag.line->blk;
4467 }
4468 } else { // not a table
4469 startLine();
4470 }
4471 }
4472
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4473 void TextSelectionDumper::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4474 {
4475 words->push_back(new TextWordSelection(word, begin, end));
4476 }
4477
endPage()4478 void TextSelectionDumper::endPage()
4479 {
4480 finishLine();
4481 }
4482
getText()4483 GooString *TextSelectionDumper::getText()
4484 {
4485 GooString *text;
4486 int i;
4487 const UnicodeMap *uMap;
4488 char space[8], eol[16];
4489 int spaceLen, eolLen;
4490
4491 text = new GooString();
4492
4493 if (!(uMap = globalParams->getTextEncoding()))
4494 return text;
4495
4496 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4497 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4498
4499 for (i = 0; i < nLines; i++) {
4500 std::vector<TextWordSelection *> *lineWords = lines[i];
4501 for (std::size_t j = 0; j < lineWords->size(); j++) {
4502 TextWordSelection *sel = (*lineWords)[j];
4503
4504 page->dumpFragment(sel->word->text + sel->begin, sel->end - sel->begin, uMap, text);
4505 if (j < lineWords->size() - 1 && sel->word->spaceAfter)
4506 text->append(space, spaceLen);
4507 }
4508 if (i < nLines - 1)
4509 text->append(eol, eolLen);
4510 }
4511
4512 return text;
4513 }
4514
takeWordList(int * nLinesOut)4515 std::vector<TextWordSelection *> **TextSelectionDumper::takeWordList(int *nLinesOut)
4516 {
4517 std::vector<TextWordSelection *> **returnValue = lines;
4518
4519 *nLinesOut = nLines;
4520 if (nLines == 0)
4521 return nullptr;
4522
4523 nLines = 0;
4524 lines = nullptr;
4525
4526 return returnValue;
4527 }
4528
4529 class TextSelectionSizer : public TextSelectionVisitor
4530 {
4531 public:
4532 TextSelectionSizer(TextPage *page, double scale);
~TextSelectionSizer()4533 ~TextSelectionSizer() override { delete list; }
4534
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4535 void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4536 void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4537 void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override {};
4538
takeRegion()4539 std::vector<PDFRectangle *> *takeRegion()
4540 {
4541 auto aux = list;
4542 list = nullptr;
4543 return aux;
4544 }
4545
4546 private:
4547 std::vector<PDFRectangle *> *list;
4548 double scale;
4549 };
4550
TextSelectionSizer(TextPage * p,double s)4551 TextSelectionSizer::TextSelectionSizer(TextPage *p, double s) : TextSelectionVisitor(p), scale(s)
4552 {
4553 list = new std::vector<PDFRectangle *>();
4554 }
4555
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4556 void TextSelectionSizer::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4557 {
4558 PDFRectangle *rect;
4559 double x1, y1, x2, y2, margin;
4560
4561 switch (line->rot) {
4562 default:
4563 case 0:
4564 margin = (line->yMax - line->yMin) / 8;
4565 x1 = line->edge[edge_begin];
4566 x2 = line->edge[edge_end];
4567 y1 = line->yMin - margin;
4568 y2 = line->yMax + margin;
4569 break;
4570 case 1:
4571 margin = (line->xMax - line->xMin) / 8;
4572 x1 = line->xMin - margin;
4573 x2 = line->xMax + margin;
4574 y1 = line->edge[edge_begin];
4575 y2 = line->edge[edge_end];
4576 break;
4577 case 2:
4578 margin = (line->yMax - line->yMin) / 8;
4579 x1 = line->edge[edge_end];
4580 x2 = line->edge[edge_begin];
4581 y1 = line->yMin - margin;
4582 y2 = line->yMax + margin;
4583 break;
4584 case 3:
4585 margin = (line->xMax - line->xMin) / 8;
4586 x1 = line->xMin - margin;
4587 x2 = line->xMax + margin;
4588 y1 = line->edge[edge_end];
4589 y2 = line->edge[edge_begin];
4590 break;
4591 }
4592
4593 rect = new PDFRectangle(floor(x1 * scale), floor(y1 * scale), ceil(x2 * scale), ceil(y2 * scale));
4594 list->push_back(rect);
4595 }
4596
4597 class TextSelectionPainter : public TextSelectionVisitor
4598 {
4599 public:
4600 TextSelectionPainter(TextPage *page, double scale, int rotation, OutputDev *out, const GfxColor *box_color, const GfxColor *glyph_color);
4601 ~TextSelectionPainter() override;
4602
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,const PDFRectangle * selection)4603 void visitBlock(TextBlock *block, TextLine *begin, TextLine *end, const PDFRectangle *selection) override {};
4604 void visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection) override;
4605 void visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection) override;
4606 void endPage();
4607
4608 private:
4609 OutputDev *out;
4610 const GfxColor *glyph_color;
4611 GfxState *state;
4612 std::vector<TextWordSelection *> *selectionList;
4613 Matrix ctm, ictm;
4614 bool hasGlyphLessFont();
4615 };
4616
TextSelectionPainter(TextPage * p,double scale,int rotation,OutputDev * outA,const GfxColor * box_color,const GfxColor * glyph_colorA)4617 TextSelectionPainter::TextSelectionPainter(TextPage *p, double scale, int rotation, OutputDev *outA, const GfxColor *box_color, const GfxColor *glyph_colorA) : TextSelectionVisitor(p), out(outA), glyph_color(glyph_colorA)
4618 {
4619 PDFRectangle box(0, 0, p->pageWidth, p->pageHeight);
4620
4621 selectionList = new std::vector<TextWordSelection *>();
4622 state = new GfxState(72 * scale, 72 * scale, &box, rotation, false);
4623
4624 state->getCTM(&ctm);
4625 ctm.invertTo(&ictm);
4626
4627 out->startPage(0, state, nullptr);
4628 out->setDefaultCTM(state->getCTM());
4629
4630 state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4631 state->setFillColor(box_color);
4632 out->updateFillColor(state);
4633 }
4634
~TextSelectionPainter()4635 TextSelectionPainter::~TextSelectionPainter()
4636 {
4637 for (auto entry : *selectionList) {
4638 delete entry;
4639 }
4640 delete selectionList;
4641 delete state;
4642 }
4643
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,const PDFRectangle * selection)4644 void TextSelectionPainter::visitLine(TextLine *line, TextWord *begin, TextWord *end, int edge_begin, int edge_end, const PDFRectangle *selection)
4645 {
4646 double x1, y1, x2, y2, margin;
4647
4648 switch (line->rot) {
4649 default:
4650 case 0:
4651 margin = (line->yMax - line->yMin) / 8;
4652 x1 = line->edge[edge_begin];
4653 x2 = line->edge[edge_end];
4654 y1 = line->yMin - margin;
4655 y2 = line->yMax + margin;
4656 break;
4657 case 1:
4658 margin = (line->xMax - line->xMin) / 8;
4659 x1 = line->xMin - margin;
4660 x2 = line->xMax + margin;
4661 y1 = line->edge[edge_begin];
4662 y2 = line->edge[edge_end];
4663 break;
4664 case 2:
4665 margin = (line->yMax - line->yMin) / 8;
4666 x1 = line->edge[edge_end];
4667 x2 = line->edge[edge_begin];
4668 y1 = line->yMin - margin;
4669 y2 = line->yMax + margin;
4670 break;
4671 case 3:
4672 margin = (line->xMax - line->xMin) / 8;
4673 x1 = line->xMin - margin;
4674 x2 = line->xMax + margin;
4675 y1 = line->edge[edge_end];
4676 y2 = line->edge[edge_begin];
4677 break;
4678 }
4679
4680 ctm.transform(x1, y1, &x1, &y1);
4681 ctm.transform(x2, y2, &x2, &y2);
4682
4683 if (x1 < x2) {
4684 x1 = floor(x1);
4685 x2 = ceil(x2);
4686 } else {
4687 x1 = ceil(x1);
4688 x2 = floor(x2);
4689 }
4690
4691 if (y1 < y2) {
4692 y1 = floor(y1);
4693 y2 = ceil(y2);
4694 } else {
4695 y1 = ceil(y1);
4696 y2 = floor(y2);
4697 }
4698
4699 ictm.transform(x1, y1, &x1, &y1);
4700 ictm.transform(x2, y2, &x2, &y2);
4701
4702 state->moveTo(x1, y1);
4703 state->lineTo(x2, y1);
4704 state->lineTo(x2, y2);
4705 state->lineTo(x1, y2);
4706 state->closePath();
4707 }
4708
visitWord(TextWord * word,int begin,int end,const PDFRectangle * selection)4709 void TextSelectionPainter::visitWord(TextWord *word, int begin, int end, const PDFRectangle *selection)
4710 {
4711 selectionList->push_back(new TextWordSelection(word, begin, end));
4712 }
4713
hasGlyphLessFont()4714 bool TextSelectionPainter::hasGlyphLessFont()
4715 {
4716 if (selectionList && selectionList->size()) {
4717 TextWordSelection *sel = (*selectionList)[0];
4718 return sel->word->invisible;
4719 }
4720
4721 return false;
4722 }
4723
endPage()4724 void TextSelectionPainter::endPage()
4725 {
4726 out->fill(state);
4727
4728 out->saveState(state);
4729 out->clip(state);
4730
4731 state->clearPath();
4732
4733 state->setFillColor(glyph_color);
4734
4735 bool usingGlyphLessFont = hasGlyphLessFont();
4736 /* Paint transparent selection when using tesseract glyphless font. Issue #157 */
4737 if (usingGlyphLessFont) {
4738 state->setFillOpacity(glyphlessSelectionOpacity);
4739 }
4740
4741 out->updateFillColor(state);
4742
4743 for (const TextWordSelection *sel : *selectionList) {
4744 int begin = sel->begin;
4745
4746 while (begin < sel->end) {
4747 TextFontInfo *font = sel->word->font[begin];
4748 font->gfxFont->incRefCnt();
4749 Matrix *mat = &sel->word->textMat[begin];
4750
4751 state->setTextMat(mat->m[0], mat->m[1], mat->m[2], mat->m[3], 0, 0);
4752 state->setFont(font->gfxFont, 1);
4753 out->updateFont(state);
4754
4755 int fEnd = begin + 1;
4756 while (fEnd < sel->end && font->matches(sel->word->font[fEnd]) && mat->m[0] == sel->word->textMat[fEnd].m[0] && mat->m[1] == sel->word->textMat[fEnd].m[1] && mat->m[2] == sel->word->textMat[fEnd].m[2]
4757 && mat->m[3] == sel->word->textMat[fEnd].m[3])
4758 fEnd++;
4759
4760 /* The only purpose of this string is to let the output device query
4761 * it's length. Might want to change this interface later. */
4762 GooString *string = new GooString((char *)sel->word->charcode, fEnd - begin);
4763 out->beginString(state, string);
4764
4765 if (!usingGlyphLessFont) {
4766 for (int j = begin; j < fEnd; j++) {
4767 if (j != begin && sel->word->charPos[j] == sel->word->charPos[j - 1])
4768 continue;
4769 out->drawChar(state, sel->word->textMat[j].m[4], sel->word->textMat[j].m[5], 0, 0, 0, 0, sel->word->charcode[j], 1, nullptr, 0);
4770 }
4771 }
4772 out->endString(state);
4773 delete string;
4774 begin = fEnd;
4775 }
4776 }
4777
4778 out->restoreState(state);
4779 out->endPage();
4780 }
4781
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4782 void TextWord::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4783 {
4784 int i, begin, end;
4785 double mid, s1, s2;
4786
4787 if (rot == 0 || rot == 2) {
4788 s1 = selection->x1;
4789 s2 = selection->x2;
4790 } else {
4791 s1 = selection->y1;
4792 s2 = selection->y2;
4793 }
4794
4795 begin = len;
4796 end = 0;
4797 for (i = 0; i < len; i++) {
4798 mid = (edge[i] + edge[i + 1]) / 2;
4799 if (XBetweenAB(mid, s1, s2)) {
4800 if (i < begin)
4801 begin = i;
4802
4803 end = i + 1;
4804 }
4805 }
4806
4807 /* Skip empty selection. */
4808 if (end <= begin)
4809 return;
4810
4811 visitor->visitWord(this, begin, end, selection);
4812 }
4813
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4814 void TextLine::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4815 {
4816 TextWord *p, *begin, *end, *current;
4817 int i, edge_begin, edge_end;
4818 PDFRectangle child_selection;
4819 double s1, s2, pMin, pMax;
4820
4821 if (rot == 0 || rot == 2) {
4822 s1 = selection->x1;
4823 s2 = selection->x2;
4824 } else {
4825 s1 = selection->y1;
4826 s2 = selection->y2;
4827 }
4828
4829 begin = nullptr;
4830 end = nullptr;
4831 current = nullptr;
4832 for (p = words; p != nullptr; p = p->next) {
4833 if (rot == 0 || rot == 2) {
4834 pMin = p->xMin;
4835 pMax = p->xMax;
4836 } else {
4837 pMin = p->yMin;
4838 pMax = p->yMax;
4839 }
4840
4841 if (blk->page->primaryLR) {
4842 if (((s1 < pMax) || (s2 < pMax)) && begin == nullptr)
4843 begin = p;
4844
4845 if (((s1 > pMin) || (s2 > pMin)) && begin != nullptr) {
4846 end = p->next;
4847 current = p;
4848 }
4849 } else {
4850 if (((s1 > pMin) || (s2 > pMin)) && begin == nullptr)
4851 begin = p;
4852
4853 if (((s1 < pMax) || (s2 < pMax)) && begin != nullptr) {
4854 end = p->next;
4855 current = p;
4856 }
4857 }
4858 }
4859
4860 if (!current)
4861 current = begin;
4862
4863 child_selection = *selection;
4864 if (style == selectionStyleWord) {
4865 if (rot == 0 || rot == 2) {
4866 child_selection.x1 = begin ? begin->xMin : xMin;
4867 if (end && end->xMax != -1) {
4868 child_selection.x2 = current->xMax;
4869 } else {
4870 child_selection.x2 = xMax;
4871 }
4872 } else {
4873 child_selection.y1 = begin ? begin->yMin : yMin;
4874 if (end && end->yMax != -1) {
4875 child_selection.y2 = current->yMax;
4876 } else {
4877 child_selection.y2 = yMax;
4878 }
4879 }
4880 }
4881
4882 if (rot == 0 || rot == 2) {
4883 s1 = child_selection.x1;
4884 s2 = child_selection.x2;
4885 } else {
4886 s1 = child_selection.y1;
4887 s2 = child_selection.y2;
4888 }
4889
4890 edge_begin = len;
4891 edge_end = 0;
4892 for (i = 0; i < len; i++) {
4893 double mid = (edge[i] + edge[i + 1]) / 2;
4894 if (XBetweenAB(mid, s1, s2)) {
4895 if (i < edge_begin)
4896 edge_begin = i;
4897
4898 edge_end = i + 1;
4899 }
4900 }
4901
4902 /* Skip empty selection. */
4903 if (edge_end <= edge_begin)
4904 return;
4905
4906 visitor->visitLine(this, begin, end, edge_begin, edge_end, &child_selection);
4907
4908 for (p = begin; p != end; p = p->next)
4909 p->visitSelection(visitor, &child_selection, style);
4910 }
4911
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)4912 void TextBlock::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
4913 {
4914 PDFRectangle child_selection;
4915 double x[2], y[2], d, best_d[2];
4916 TextLine *p, *best_line[2];
4917 int i, count = 0, best_count[2], start, stop;
4918 bool all[2];
4919
4920 x[0] = selection->x1;
4921 y[0] = selection->y1;
4922 x[1] = selection->x2;
4923 y[1] = selection->y2;
4924
4925 for (i = 0; i < 2; i++) {
4926 // the first/last lines are often not nearest
4927 // the corners, so we have to force them to be
4928 // selected when the selection runs outside this
4929 // block.
4930 if (page->primaryLR) {
4931 all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
4932 if (x[i] <= this->xMin && y[i] <= this->yMin) {
4933 best_line[i] = this->lines;
4934 best_count[i] = 1;
4935 } else {
4936 best_line[i] = nullptr;
4937 best_count[i] = 0;
4938 }
4939 } else {
4940 all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
4941 if (x[i] >= this->xMax && y[i] <= this->yMin) {
4942 best_line[i] = this->lines;
4943 best_count[i] = 1;
4944 } else {
4945 best_line[i] = nullptr;
4946 best_count[i] = 0;
4947 }
4948 }
4949 best_d[i] = 0;
4950 }
4951
4952 // find the nearest line to the selection points
4953 // using the manhattan distance.
4954 for (p = this->lines; p; p = p->next) {
4955 count++;
4956 for (i = 0; i < 2; i++) {
4957 d = fmax(p->xMin - x[i], 0.0) + fmax(x[i] - p->xMax, 0.0) + fmax(p->yMin - y[i], 0.0) + fmax(y[i] - p->yMax, 0.0);
4958 if (!best_line[i] || all[i] || d < best_d[i]) {
4959 best_line[i] = p;
4960 best_count[i] = count;
4961 best_d[i] = d;
4962 }
4963 }
4964 }
4965 // assert: best is always set.
4966 if (!best_line[0] || !best_line[1]) {
4967 return;
4968 }
4969
4970 // Now decide which point was first.
4971 if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
4972 start = 0;
4973 stop = 1;
4974 } else {
4975 start = 1;
4976 stop = 0;
4977 }
4978
4979 visitor->visitBlock(this, best_line[start], best_line[stop], selection);
4980
4981 for (p = best_line[start]; p; p = p->next) {
4982 if (page->primaryLR) {
4983 child_selection.x1 = p->xMin;
4984 child_selection.x2 = p->xMax;
4985 } else {
4986 child_selection.x1 = p->xMax;
4987 child_selection.x2 = p->xMin;
4988 }
4989 child_selection.y1 = p->yMin;
4990 child_selection.y2 = p->yMax;
4991 if (style == selectionStyleLine) {
4992 if (p == best_line[start]) {
4993 child_selection.x1 = 0;
4994 child_selection.y1 = 0;
4995 }
4996 if (p == best_line[stop]) {
4997 child_selection.x2 = page->pageWidth;
4998 child_selection.y2 = page->pageHeight;
4999 }
5000 } else {
5001 if (p == best_line[start]) {
5002 child_selection.x1 = fmax(p->xMin, fmin(p->xMax, x[start]));
5003 child_selection.y1 = fmax(p->yMin, fmin(p->yMax, y[start]));
5004 }
5005 if (p == best_line[stop]) {
5006 child_selection.x2 = fmax(p->xMin, fmin(p->xMax, x[stop]));
5007 child_selection.y2 = fmax(p->yMin, fmin(p->yMax, y[stop]));
5008 }
5009 }
5010 p->visitSelection(visitor, &child_selection, style);
5011 if (p == best_line[stop]) {
5012 return;
5013 }
5014 }
5015 }
5016
visitSelection(TextSelectionVisitor * visitor,const PDFRectangle * selection,SelectionStyle style)5017 void TextPage::visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style)
5018 {
5019 PDFRectangle child_selection;
5020 double x[2], y[2], d, best_d[2];
5021 double xMin, yMin, xMax, yMax;
5022 TextFlow *flow, *best_flow[2];
5023 TextBlock *blk, *best_block[2];
5024 int i, count = 0, best_count[2], start, stop;
5025
5026 if (!flows)
5027 return;
5028
5029 x[0] = selection->x1;
5030 y[0] = selection->y1;
5031 x[1] = selection->x2;
5032 y[1] = selection->y2;
5033
5034 xMin = pageWidth;
5035 yMin = pageHeight;
5036 xMax = 0.0;
5037 yMax = 0.0;
5038
5039 for (i = 0; i < 2; i++) {
5040 best_block[i] = nullptr;
5041 best_flow[i] = nullptr;
5042 best_count[i] = 0;
5043 best_d[i] = 0;
5044 }
5045
5046 // find the nearest blocks to the selection points
5047 // using the manhattan distance.
5048 for (flow = flows; flow; flow = flow->next) {
5049 for (blk = flow->blocks; blk; blk = blk->next) {
5050 count++;
5051 // the first/last blocks in reading order are
5052 // often not the closest to the page corners;
5053 // track the corners, force those blocks to
5054 // be selected if the selection runs across
5055 // multiple pages.
5056 xMin = fmin(xMin, blk->xMin);
5057 yMin = fmin(yMin, blk->yMin);
5058 xMax = fmax(xMax, blk->xMax);
5059 yMax = fmax(yMax, blk->yMax);
5060 for (i = 0; i < 2; i++) {
5061 d = fmax(blk->xMin - x[i], 0.0) + fmax(x[i] - blk->xMax, 0.0) + fmax(blk->yMin - y[i], 0.0) + fmax(y[i] - blk->yMax, 0.0);
5062 if (!best_block[i] || d < best_d[i] || (!blk->next && !flow->next && x[i] >= fmin(xMax, pageWidth) && y[i] >= fmin(yMax, pageHeight))) {
5063 best_block[i] = blk;
5064 best_flow[i] = flow;
5065 best_count[i] = count;
5066 best_d[i] = d;
5067 }
5068 }
5069 }
5070 }
5071 for (i = 0; i < 2; i++) {
5072 if (primaryLR) {
5073 if (x[i] < xMin && y[i] < yMin) {
5074 best_block[i] = flows->blocks;
5075 best_flow[i] = flows;
5076 best_count[i] = 1;
5077 }
5078 } else {
5079 if (x[i] > xMax && y[i] < yMin) {
5080 best_block[i] = flows->blocks;
5081 best_flow[i] = flows;
5082 best_count[i] = 1;
5083 }
5084 }
5085 }
5086 // assert: best is always set.
5087 if (!best_block[0] || !best_block[1]) {
5088 return;
5089 }
5090
5091 // Now decide which point was first.
5092 if (best_count[0] < best_count[1] || (best_count[0] == best_count[1] && y[0] < y[1])) {
5093 start = 0;
5094 stop = 1;
5095 } else {
5096 start = 1;
5097 stop = 0;
5098 }
5099
5100 for (flow = best_flow[start]; flow; flow = flow->next) {
5101 if (flow == best_flow[start]) {
5102 blk = best_block[start];
5103 } else {
5104 blk = flow->blocks;
5105 }
5106 for (; blk; blk = blk->next) {
5107 if (primaryLR) {
5108 child_selection.x1 = blk->xMin;
5109 child_selection.x2 = blk->xMax;
5110 } else {
5111 child_selection.x1 = blk->xMax;
5112 child_selection.x2 = blk->xMin;
5113 }
5114 child_selection.y1 = blk->yMin;
5115 child_selection.y2 = blk->yMax;
5116 if (blk == best_block[start]) {
5117 child_selection.x1 = fmax(blk->xMin, fmin(blk->xMax, x[start]));
5118 child_selection.y1 = fmax(blk->yMin, fmin(blk->yMax, y[start]));
5119 }
5120 if (blk == best_block[stop]) {
5121 child_selection.x2 = fmax(blk->xMin, fmin(blk->xMax, x[stop]));
5122 child_selection.y2 = fmax(blk->yMin, fmin(blk->yMax, y[stop]));
5123 blk->visitSelection(visitor, &child_selection, style);
5124 return;
5125 }
5126 blk->visitSelection(visitor, &child_selection, style);
5127 }
5128 }
5129 }
5130
drawSelection(OutputDev * out,double scale,int rotation,const PDFRectangle * selection,SelectionStyle style,const GfxColor * glyph_color,const GfxColor * box_color)5131 void TextPage::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5132 {
5133 TextSelectionPainter painter(this, scale, rotation, out, box_color, glyph_color);
5134
5135 visitSelection(&painter, selection, style);
5136 painter.endPage();
5137 }
5138
getSelectionRegion(const PDFRectangle * selection,SelectionStyle style,double scale)5139 std::vector<PDFRectangle *> *TextPage::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5140 {
5141 TextSelectionSizer sizer(this, scale);
5142
5143 visitSelection(&sizer, selection, style);
5144
5145 return sizer.takeRegion();
5146 }
5147
getSelectionText(const PDFRectangle * selection,SelectionStyle style)5148 GooString *TextPage::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5149 {
5150 TextSelectionDumper dumper(this);
5151
5152 visitSelection(&dumper, selection, style);
5153 dumper.endPage();
5154
5155 return dumper.getText();
5156 }
5157
getSelectionWords(const PDFRectangle * selection,SelectionStyle style,int * nLines)5158 std::vector<TextWordSelection *> **TextPage::getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines)
5159 {
5160 TextSelectionDumper dumper(this);
5161
5162 visitSelection(&dumper, selection, style);
5163 dumper.endPage();
5164
5165 return dumper.takeWordList(nLines);
5166 }
5167
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax) const5168 bool TextPage::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5169 {
5170 TextBlock *blk;
5171 TextLine *line;
5172 TextWord *word;
5173 double xMin0, xMax0, yMin0, yMax0;
5174 double xMin1, xMax1, yMin1, yMax1;
5175 bool first;
5176 int i, j0, j1;
5177
5178 if (rawOrder) {
5179 return false;
5180 }
5181
5182 //~ this doesn't correctly handle ranges split across multiple lines
5183 //~ (the highlighted region is the bounding box of all the parts of
5184 //~ the range)
5185 first = true;
5186 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
5187 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
5188 for (i = 0; i < nBlocks; ++i) {
5189 blk = blocks[i];
5190 for (line = blk->lines; line; line = line->next) {
5191 for (word = line->words; word; word = word->next) {
5192 if (pos < word->charPos[word->len] && pos + length > word->charPos[0]) {
5193 for (j0 = 0; j0 < word->len && pos >= word->charPos[j0 + 1]; ++j0)
5194 ;
5195 for (j1 = word->len - 1; j1 > j0 && pos + length <= word->charPos[j1]; --j1)
5196 ;
5197 switch (line->rot) {
5198 case 0:
5199 xMin1 = word->edge[j0];
5200 xMax1 = word->edge[j1 + 1];
5201 yMin1 = word->yMin;
5202 yMax1 = word->yMax;
5203 break;
5204 case 1:
5205 xMin1 = word->xMin;
5206 xMax1 = word->xMax;
5207 yMin1 = word->edge[j0];
5208 yMax1 = word->edge[j1 + 1];
5209 break;
5210 case 2:
5211 xMin1 = word->edge[j1 + 1];
5212 xMax1 = word->edge[j0];
5213 yMin1 = word->yMin;
5214 yMax1 = word->yMax;
5215 break;
5216 case 3:
5217 xMin1 = word->xMin;
5218 xMax1 = word->xMax;
5219 yMin1 = word->edge[j1 + 1];
5220 yMax1 = word->edge[j0];
5221 break;
5222 }
5223 if (first || xMin1 < xMin0) {
5224 xMin0 = xMin1;
5225 }
5226 if (first || xMax1 > xMax0) {
5227 xMax0 = xMax1;
5228 }
5229 if (first || yMin1 < yMin0) {
5230 yMin0 = yMin1;
5231 }
5232 if (first || yMax1 > yMax0) {
5233 yMax0 = yMax1;
5234 }
5235 first = false;
5236 }
5237 }
5238 }
5239 }
5240 if (!first) {
5241 *xMin = xMin0;
5242 *xMax = xMax0;
5243 *yMin = yMin0;
5244 *yMax = yMax0;
5245 return true;
5246 }
5247 return false;
5248 }
5249
dump(void * outputStream,TextOutputFunc outputFunc,bool physLayout,EndOfLineKind textEOL,bool pageBreaks)5250 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks)
5251 {
5252 const UnicodeMap *uMap;
5253 TextFlow *flow;
5254 TextBlock *blk;
5255 TextLine *line;
5256 TextLineFrag *frags;
5257 TextWord *word;
5258 int nFrags, fragsSize;
5259 TextLineFrag *frag;
5260 char space[8], eol[16], eop[8];
5261 int spaceLen, eolLen, eopLen;
5262 GooString *s;
5263 double delta;
5264 int col, i, j, d, n;
5265
5266 // get the output encoding
5267 if (!(uMap = globalParams->getTextEncoding())) {
5268 return;
5269 }
5270 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
5271 eolLen = 0; // make gcc happy
5272 switch (textEOL) {
5273 case eolUnix:
5274 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
5275 break;
5276 case eolDOS:
5277 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5278 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
5279 break;
5280 case eolMac:
5281 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
5282 break;
5283 }
5284 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
5285
5286 //~ writing mode (horiz/vert)
5287
5288 // output the page in raw (content stream) order
5289 if (rawOrder) {
5290
5291 for (word = rawWords; word; word = word->next) {
5292 s = new GooString();
5293 dumpFragment(word->text, word->len, uMap, s);
5294 (*outputFunc)(outputStream, s->c_str(), s->getLength());
5295 delete s;
5296 if (word->next && fabs(word->next->base - word->base) < maxIntraLineDelta * word->fontSize && word->next->xMin > word->xMax - minDupBreakOverlap * word->fontSize) {
5297 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
5298 (*outputFunc)(outputStream, space, spaceLen);
5299 }
5300 } else {
5301 (*outputFunc)(outputStream, eol, eolLen);
5302 }
5303 }
5304
5305 // output the page, maintaining the original physical layout
5306 } else if (physLayout) {
5307
5308 // collect the line fragments for the page and sort them
5309 fragsSize = 256;
5310 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
5311 nFrags = 0;
5312 for (i = 0; i < nBlocks; ++i) {
5313 blk = blocks[i];
5314 for (line = blk->lines; line; line = line->next) {
5315 if (nFrags == fragsSize) {
5316 fragsSize *= 2;
5317 frags = (TextLineFrag *)greallocn(frags, fragsSize, sizeof(TextLineFrag));
5318 }
5319 frags[nFrags].init(line, 0, line->len);
5320 frags[nFrags].computeCoords(true);
5321 ++nFrags;
5322 }
5323 }
5324 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
5325 i = 0;
5326 while (i < nFrags) {
5327 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
5328 for (j = i + 1; j < nFrags && fabs(frags[j].base - frags[i].base) < delta; ++j)
5329 ;
5330 qsort(frags + i, j - i, sizeof(TextLineFrag), &TextLineFrag::cmpXYColumnPrimaryRot);
5331 i = j;
5332 }
5333
5334 #if 0 // for debugging
5335 printf("*** line fragments ***\n");
5336 for (i = 0; i < nFrags; ++i) {
5337 frag = &frags[i];
5338 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
5339 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
5340 for (n = 0; n < frag->len; ++n) {
5341 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
5342 }
5343 printf("'\n");
5344 }
5345 printf("\n");
5346 #endif
5347
5348 // generate output
5349 col = 0;
5350 for (i = 0; i < nFrags; ++i) {
5351 frag = &frags[i];
5352
5353 // column alignment
5354 for (; col < frag->col; ++col) {
5355 (*outputFunc)(outputStream, space, spaceLen);
5356 }
5357
5358 // print the line
5359 s = new GooString();
5360 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
5361 (*outputFunc)(outputStream, s->c_str(), s->getLength());
5362 delete s;
5363
5364 // print one or more returns if necessary
5365 if (i == nFrags - 1 || frags[i + 1].col < col || fabs(frags[i + 1].base - frag->base) > maxIntraLineDelta * frag->line->words->fontSize) {
5366 if (i < nFrags - 1) {
5367 d = (int)((frags[i + 1].base - frag->base) / frag->line->words->fontSize);
5368 if (d < 1) {
5369 d = 1;
5370 } else if (d > 5) {
5371 d = 5;
5372 }
5373 } else {
5374 d = 1;
5375 }
5376 for (; d > 0; --d) {
5377 (*outputFunc)(outputStream, eol, eolLen);
5378 }
5379 col = 0;
5380 }
5381 }
5382
5383 gfree(frags);
5384
5385 // output the page, "undoing" the layout
5386 } else {
5387 for (flow = flows; flow; flow = flow->next) {
5388 for (blk = flow->blocks; blk; blk = blk->next) {
5389 for (line = blk->lines; line; line = line->next) {
5390 n = line->len;
5391 if (line->hyphenated && (line->next || blk->next)) {
5392 --n;
5393 }
5394 s = new GooString();
5395 dumpFragment(line->text, n, uMap, s);
5396 (*outputFunc)(outputStream, s->c_str(), s->getLength());
5397 delete s;
5398 // output a newline when a hyphen is not suppressed
5399 if (n == line->len) {
5400 (*outputFunc)(outputStream, eol, eolLen);
5401 }
5402 }
5403 }
5404 (*outputFunc)(outputStream, eol, eolLen);
5405 }
5406 }
5407
5408 // end of page
5409 if (pageBreaks) {
5410 (*outputFunc)(outputStream, eop, eopLen);
5411 }
5412 }
5413
setMergeCombining(bool merge)5414 void TextPage::setMergeCombining(bool merge)
5415 {
5416 mergeCombining = merge;
5417 }
5418
assignColumns(TextLineFrag * frags,int nFrags,bool oneRot) const5419 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, bool oneRot) const
5420 {
5421 TextLineFrag *frag0, *frag1;
5422 int rot, col1, col2, i, j, k;
5423
5424 // all text in the region has the same rotation -- recompute the
5425 // column numbers based only on the text in the region
5426 if (oneRot) {
5427 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
5428 rot = frags[0].line->rot;
5429 for (i = 0; i < nFrags; ++i) {
5430 frag0 = &frags[i];
5431 col1 = 0;
5432 for (j = 0; j < i; ++j) {
5433 frag1 = &frags[j];
5434 col2 = 0; // make gcc happy
5435 switch (rot) {
5436 case 0:
5437 if (frag0->xMin >= frag1->xMax) {
5438 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5439 } else {
5440 for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5441 ;
5442 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5443 }
5444 break;
5445 case 1:
5446 if (frag0->yMin >= frag1->yMax) {
5447 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5448 } else {
5449 for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMin >= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5450 ;
5451 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5452 }
5453 break;
5454 case 2:
5455 if (frag0->xMax <= frag1->xMin) {
5456 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5457 } else {
5458 for (k = frag1->start; k < frag1->start + frag1->len && frag0->xMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5459 ;
5460 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5461 }
5462 break;
5463 case 3:
5464 if (frag0->yMax <= frag1->yMin) {
5465 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] - frag1->line->col[frag1->start]) + 1;
5466 } else {
5467 for (k = frag1->start; k < frag1->start + frag1->len && frag0->yMax <= 0.5 * (frag1->line->edge[k] + frag1->line->edge[k + 1]); ++k)
5468 ;
5469 col2 = frag1->col + frag1->line->col[k] - frag1->line->col[frag1->start];
5470 }
5471 break;
5472 }
5473 if (col2 > col1) {
5474 col1 = col2;
5475 }
5476 }
5477 frag0->col = col1;
5478 }
5479
5480 // the region includes text at different rotations -- use the
5481 // globally assigned column numbers, offset by the minimum column
5482 // number (i.e., shift everything over to column 0)
5483 } else {
5484 col1 = frags[0].col;
5485 for (i = 1; i < nFrags; ++i) {
5486 if (frags[i].col < col1) {
5487 col1 = frags[i].col;
5488 }
5489 }
5490 for (i = 0; i < nFrags; ++i) {
5491 frags[i].col -= col1;
5492 }
5493 }
5494 }
5495
dumpFragment(const Unicode * text,int len,const UnicodeMap * uMap,GooString * s) const5496 int TextPage::dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const
5497 {
5498 if (uMap->isUnicode()) {
5499 return reorderText(text, len, uMap, primaryLR, s, nullptr);
5500 } else {
5501 int nCols = 0;
5502
5503 char buf[8];
5504 int buflen = 0;
5505
5506 for (int i = 0; i < len; ++i) {
5507 buflen = uMap->mapUnicode(text[i], buf, sizeof(buf));
5508 s->append(buf, buflen);
5509 nCols += buflen;
5510 }
5511
5512 return nCols;
5513 }
5514 }
5515
5516 #ifdef TEXTOUT_WORD_LIST
makeWordList(bool physLayout)5517 std::unique_ptr<TextWordList> TextPage::makeWordList(bool physLayout)
5518 {
5519 return std::make_unique<TextWordList>(this, physLayout);
5520 }
5521 #endif
5522
5523 //------------------------------------------------------------------------
5524 // ActualText
5525 //------------------------------------------------------------------------
ActualText(TextPage * out)5526 ActualText::ActualText(TextPage *out)
5527 {
5528 out->incRefCnt();
5529 text = out;
5530 actualText = nullptr;
5531 actualTextNBytes = 0;
5532 }
5533
~ActualText()5534 ActualText::~ActualText()
5535 {
5536 if (actualText)
5537 delete actualText;
5538 text->decRefCnt();
5539 }
5540
addChar(const GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,const Unicode * u,int uLen)5541 void ActualText::addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen)
5542 {
5543 if (!actualText) {
5544 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5545 return;
5546 }
5547
5548 // Inside ActualText span.
5549 if (!actualTextNBytes) {
5550 actualTextX0 = x;
5551 actualTextY0 = y;
5552 }
5553 actualTextX1 = x + dx;
5554 actualTextY1 = y + dy;
5555 actualTextNBytes += nBytes;
5556 }
5557
begin(const GfxState * state,const GooString * t)5558 void ActualText::begin(const GfxState *state, const GooString *t)
5559 {
5560 if (actualText)
5561 delete actualText;
5562 actualText = new GooString(t);
5563 actualTextNBytes = 0;
5564 }
5565
end(const GfxState * state)5566 void ActualText::end(const GfxState *state)
5567 {
5568 // ActualText span closed. Output the span text and the
5569 // extents of all the glyphs inside the span
5570
5571 if (actualTextNBytes) {
5572 Unicode *uni = nullptr;
5573 int length;
5574
5575 // now that we have the position info for all of the text inside
5576 // the marked content span, we feed the "ActualText" back through
5577 // text->addChar()
5578 length = TextStringToUCS4(actualText->toStr(), &uni);
5579 text->addChar(state, actualTextX0, actualTextY0, actualTextX1 - actualTextX0, actualTextY1 - actualTextY0, 0, actualTextNBytes, uni, length);
5580 gfree(uni);
5581 }
5582
5583 delete actualText;
5584 actualText = nullptr;
5585 actualTextNBytes = 0;
5586 }
5587
5588 //------------------------------------------------------------------------
5589 // TextOutputDev
5590 //------------------------------------------------------------------------
5591
TextOutputDev_outputToFile(void * stream,const char * text,int len)5592 static void TextOutputDev_outputToFile(void *stream, const char *text, int len)
5593 {
5594 fwrite(text, 1, len, (FILE *)stream);
5595 }
5596
TextOutputDev(const char * fileName,bool physLayoutA,double fixedPitchA,bool rawOrderA,bool append,bool discardDiagA)5597 TextOutputDev::TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA)
5598 {
5599 text = nullptr;
5600 physLayout = physLayoutA;
5601 fixedPitch = physLayout ? fixedPitchA : 0;
5602 rawOrder = rawOrderA;
5603 discardDiag = discardDiagA;
5604 doHTML = false;
5605 textEOL = defaultEndOfLine();
5606 textPageBreaks = true;
5607 ok = true;
5608
5609 // open file
5610 needClose = false;
5611 if (fileName) {
5612 if (!strcmp(fileName, "-")) {
5613 outputStream = stdout;
5614 #if defined(_WIN32) || defined(__CYGWIN__)
5615 // keep DOS from munging the end-of-line characters
5616 _setmode(fileno(stdout), O_BINARY);
5617 #endif
5618 } else if ((outputStream = openFile(fileName, append ? "ab" : "wb"))) {
5619 needClose = true;
5620 } else {
5621 error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
5622 ok = false;
5623 actualText = nullptr;
5624 return;
5625 }
5626 outputFunc = &TextOutputDev_outputToFile;
5627 } else {
5628 outputStream = nullptr;
5629 }
5630
5631 // set up text object
5632 text = new TextPage(rawOrderA, discardDiagA);
5633 actualText = new ActualText(text);
5634 }
5635
TextOutputDev(TextOutputFunc func,void * stream,bool physLayoutA,double fixedPitchA,bool rawOrderA,bool discardDiagA)5636 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA)
5637 {
5638 outputFunc = func;
5639 outputStream = stream;
5640 needClose = false;
5641 physLayout = physLayoutA;
5642 fixedPitch = physLayout ? fixedPitchA : 0;
5643 rawOrder = rawOrderA;
5644 discardDiag = discardDiagA;
5645 doHTML = false;
5646 text = new TextPage(rawOrderA, discardDiagA);
5647 actualText = new ActualText(text);
5648 textEOL = defaultEndOfLine();
5649 textPageBreaks = true;
5650 ok = true;
5651 }
5652
~TextOutputDev()5653 TextOutputDev::~TextOutputDev()
5654 {
5655 if (needClose) {
5656 fclose((FILE *)outputStream);
5657 }
5658 if (text) {
5659 text->decRefCnt();
5660 }
5661 delete actualText;
5662 }
5663
startPage(int pageNum,GfxState * state,XRef * xref)5664 void TextOutputDev::startPage(int pageNum, GfxState *state, XRef *xref)
5665 {
5666 text->startPage(state);
5667 }
5668
endPage()5669 void TextOutputDev::endPage()
5670 {
5671 text->endPage();
5672 text->coalesce(physLayout, fixedPitch, doHTML);
5673 if (outputStream) {
5674 text->dump(outputStream, outputFunc, physLayout, textEOL, textPageBreaks);
5675 }
5676 }
5677
restoreState(GfxState * state)5678 void TextOutputDev::restoreState(GfxState *state)
5679 {
5680 text->updateFont(state);
5681 }
5682
updateFont(GfxState * state)5683 void TextOutputDev::updateFont(GfxState *state)
5684 {
5685 text->updateFont(state);
5686 }
5687
beginString(GfxState * state,const GooString * s)5688 void TextOutputDev::beginString(GfxState *state, const GooString *s) { }
5689
endString(GfxState * state)5690 void TextOutputDev::endString(GfxState *state) { }
5691
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,const Unicode * u,int uLen)5692 void TextOutputDev::drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen)
5693 {
5694 actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5695 }
5696
incCharCount(int nChars)5697 void TextOutputDev::incCharCount(int nChars)
5698 {
5699 text->incCharCount(nChars);
5700 }
5701
beginActualText(GfxState * state,const GooString * t)5702 void TextOutputDev::beginActualText(GfxState *state, const GooString *t)
5703 {
5704 actualText->begin(state, t);
5705 }
5706
endActualText(GfxState * state)5707 void TextOutputDev::endActualText(GfxState *state)
5708 {
5709 actualText->end(state);
5710 }
5711
stroke(GfxState * state)5712 void TextOutputDev::stroke(GfxState *state)
5713 {
5714 double x[2], y[2];
5715
5716 if (!doHTML) {
5717 return;
5718 }
5719 const GfxPath *path = state->getPath();
5720 if (path->getNumSubpaths() != 1) {
5721 return;
5722 }
5723 const GfxSubpath *subpath = path->getSubpath(0);
5724 if (subpath->getNumPoints() != 2) {
5725 return;
5726 }
5727 state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
5728 state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
5729
5730 // look for a vertical or horizontal line
5731 if (x[0] == x[1] || y[0] == y[1]) {
5732 text->addUnderline(x[0], y[0], x[1], y[1]);
5733 }
5734 }
5735
fill(GfxState * state)5736 void TextOutputDev::fill(GfxState *state)
5737 {
5738 double x[5], y[5];
5739 double rx0, ry0, rx1, ry1, t;
5740 int i;
5741
5742 if (!doHTML) {
5743 return;
5744 }
5745 const GfxPath *path = state->getPath();
5746 if (path->getNumSubpaths() != 1) {
5747 return;
5748 }
5749 const GfxSubpath *subpath = path->getSubpath(0);
5750 if (subpath->getNumPoints() != 5) {
5751 return;
5752 }
5753 for (i = 0; i < 5; ++i) {
5754 if (subpath->getCurve(i)) {
5755 return;
5756 }
5757 state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
5758 }
5759
5760 // look for a rectangle
5761 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] && x[0] == x[4] && y[0] == y[4]) {
5762 rx0 = x[0];
5763 ry0 = y[0];
5764 rx1 = x[2];
5765 ry1 = y[1];
5766 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] && x[0] == x[4] && y[0] == y[4]) {
5767 rx0 = x[0];
5768 ry0 = y[0];
5769 rx1 = x[1];
5770 ry1 = y[2];
5771 } else {
5772 return;
5773 }
5774 if (rx1 < rx0) {
5775 t = rx0;
5776 rx0 = rx1;
5777 rx1 = t;
5778 }
5779 if (ry1 < ry0) {
5780 t = ry0;
5781 ry0 = ry1;
5782 ry1 = t;
5783 }
5784
5785 // skinny horizontal rectangle
5786 if (ry1 - ry0 < rx1 - rx0) {
5787 if (ry1 - ry0 < maxUnderlineWidth) {
5788 ry0 = 0.5 * (ry0 + ry1);
5789 text->addUnderline(rx0, ry0, rx1, ry0);
5790 }
5791
5792 // skinny vertical rectangle
5793 } else {
5794 if (rx1 - rx0 < maxUnderlineWidth) {
5795 rx0 = 0.5 * (rx0 + rx1);
5796 text->addUnderline(rx0, ry0, rx0, ry1);
5797 }
5798 }
5799 }
5800
eoFill(GfxState * state)5801 void TextOutputDev::eoFill(GfxState *state)
5802 {
5803 if (!doHTML) {
5804 return;
5805 }
5806 fill(state);
5807 }
5808
processLink(AnnotLink * link)5809 void TextOutputDev::processLink(AnnotLink *link)
5810 {
5811 double x1, y1, x2, y2;
5812 int xMin, yMin, xMax, yMax, x, y;
5813
5814 if (!doHTML) {
5815 return;
5816 }
5817 link->getRect(&x1, &y1, &x2, &y2);
5818 cvtUserToDev(x1, y1, &x, &y);
5819 xMin = xMax = x;
5820 yMin = yMax = y;
5821 cvtUserToDev(x1, y2, &x, &y);
5822 if (x < xMin) {
5823 xMin = x;
5824 } else if (x > xMax) {
5825 xMax = x;
5826 }
5827 if (y < yMin) {
5828 yMin = y;
5829 } else if (y > yMax) {
5830 yMax = y;
5831 }
5832 cvtUserToDev(x2, y1, &x, &y);
5833 if (x < xMin) {
5834 xMin = x;
5835 } else if (x > xMax) {
5836 xMax = x;
5837 }
5838 if (y < yMin) {
5839 yMin = y;
5840 } else if (y > yMax) {
5841 yMax = y;
5842 }
5843 cvtUserToDev(x2, y2, &x, &y);
5844 if (x < xMin) {
5845 xMin = x;
5846 } else if (x > xMax) {
5847 xMax = x;
5848 }
5849 if (y < yMin) {
5850 yMin = y;
5851 } else if (y > yMax) {
5852 yMax = y;
5853 }
5854 text->addLink(xMin, yMin, xMax, yMax, link);
5855 }
5856
findText(const Unicode * s,int len,bool startAtTop,bool stopAtBottom,bool startAtLast,bool stopAtLast,bool caseSensitive,bool backward,bool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax) const5857 bool TextOutputDev::findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const
5858 {
5859 return text->findText(s, len, startAtTop, stopAtBottom, startAtLast, stopAtLast, caseSensitive, backward, wholeWord, xMin, yMin, xMax, yMax);
5860 }
5861
getText(double xMin,double yMin,double xMax,double yMax) const5862 GooString *TextOutputDev::getText(double xMin, double yMin, double xMax, double yMax) const
5863 {
5864 return text->getText(xMin, yMin, xMax, yMax, textEOL);
5865 }
5866
drawSelection(OutputDev * out,double scale,int rotation,const PDFRectangle * selection,SelectionStyle style,const GfxColor * glyph_color,const GfxColor * box_color)5867 void TextOutputDev::drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color)
5868 {
5869 text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5870 }
5871
getSelectionRegion(const PDFRectangle * selection,SelectionStyle style,double scale)5872 std::vector<PDFRectangle *> *TextOutputDev::getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale)
5873 {
5874 return text->getSelectionRegion(selection, style, scale);
5875 }
5876
getSelectionText(const PDFRectangle * selection,SelectionStyle style)5877 GooString *TextOutputDev::getSelectionText(const PDFRectangle *selection, SelectionStyle style)
5878 {
5879 return text->getSelectionText(selection, style);
5880 }
5881
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax) const5882 bool TextOutputDev::findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const
5883 {
5884 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5885 }
5886
setMergeCombining(bool merge)5887 void TextOutputDev::setMergeCombining(bool merge)
5888 {
5889 text->setMergeCombining(merge);
5890 }
5891
5892 #ifdef TEXTOUT_WORD_LIST
makeWordList()5893 std::unique_ptr<TextWordList> TextOutputDev::makeWordList()
5894 {
5895 return text->makeWordList(physLayout);
5896 }
5897 #endif
5898
takeText()5899 TextPage *TextOutputDev::takeText()
5900 {
5901 TextPage *ret;
5902
5903 ret = text;
5904 text = new TextPage(rawOrder, discardDiag);
5905 return ret;
5906 }
5907
getFlows() const5908 const TextFlow *TextOutputDev::getFlows() const
5909 {
5910 return text->getFlows();
5911 }
5912