1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
14 // Copyright (C) 2005 Nickolay V. Shmyrev <nshmyrev@yandex.ru>
15 // Copyright (C) 2006-2008 Carlos Garcia Campos <carlosgc@gnome.org>
16 // Copyright (C) 2006, 2007 Ed Catmur <ed@catmur.co.uk>
17 // Copyright (C) 2006 Jeff Muizelaar <jeff@infidigm.net>
18 // Copyright (C) 2007, 2008 Adrian Johnson <ajohnson@redneon.com>
19 // Copyright (C) 2008 Koji Otani <sho@bbr.jp>
20 // Copyright (C) 2008, 2010 Albert Astals Cid <aacid@kde.org>
21 // Copyright (C) 2008 Pino Toscano <pino@kde.org>
22 // Copyright (C) 2008, 2010 Hib Eris <hib@hiberis.nl>
23 // Copyright (C) 2009 Ross Moore <ross@maths.mq.edu.au>
24 // Copyright (C) 2009 Kovid Goyal <kovid@kovidgoyal.net>
25 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
26 // Copyright (C) 2010 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
27 //
28 // To see a description of the changes please see the Changelog file that
29 // came with your tarball or type make ChangeLog if you are building from git
30 //
31 //========================================================================
32
33 #include <config.h>
34
35 #ifdef USE_GCC_PRAGMAS
36 #pragma implementation
37 #endif
38
39 #include <stdio.h>
40 #include <stdlib.h>
41 #include <stddef.h>
42 #include <math.h>
43 #include <float.h>
44 #include <ctype.h>
45 #ifdef _WIN32
46 #include <fcntl.h> // for O_BINARY
47 #include <io.h> // for setmode
48 #endif
49 #include "goo/gmem.h"
50 #include "goo/GooString.h"
51 #include "goo/GooList.h"
52 #include "poppler-config.h"
53 #include "Error.h"
54 #include "GlobalParams.h"
55 #include "UnicodeMap.h"
56 #include "UnicodeTypeTable.h"
57 #include "Link.h"
58 #include "TextOutputDev.h"
59 #include "Page.h"
60 #include "PDFDocEncoding.h"
61
62 #ifdef MACOS
63 // needed for setting type/creator of MacOS files
64 #include "ICSupport.h"
65 #endif
66
67 //------------------------------------------------------------------------
68 // parameters
69 //------------------------------------------------------------------------
70
71 // Each bucket in a text pool includes baselines within a range of
72 // this many points.
73 #define textPoolStep 4
74
75 // Inter-character space width which will cause addChar to start a new
76 // word.
77 #define minWordBreakSpace 0.1
78
79 // Negative inter-character space width, i.e., overlap, which will
80 // cause addChar to start a new word.
81 #define minDupBreakOverlap 0.2
82
83 // Max distance between baselines of two lines within a block, as a
84 // fraction of the font size.
85 #define maxLineSpacingDelta 1.5
86
87 // Max difference in primary font sizes on two lines in the same
88 // block. Delta1 is used when examining new lines above and below the
89 // current block; delta2 is used when examining text that overlaps the
90 // current block; delta3 is used when examining text to the left and
91 // right of the current block.
92 #define maxBlockFontSizeDelta1 0.05
93 #define maxBlockFontSizeDelta2 0.6
94 #define maxBlockFontSizeDelta3 0.2
95
96 // Max difference in font sizes inside a word.
97 #define maxWordFontSizeDelta 0.05
98
99 // Maximum distance between baselines of two words on the same line,
100 // e.g., distance between subscript or superscript and the primary
101 // baseline, as a fraction of the font size.
102 #define maxIntraLineDelta 0.5
103
104 // Minimum inter-word spacing, as a fraction of the font size. (Only
105 // used for raw ordering.)
106 #define minWordSpacing 0.15
107
108 // Maximum inter-word spacing, as a fraction of the font size.
109 #define maxWordSpacing 1.5
110
111 // Maximum horizontal spacing which will allow a word to be pulled
112 // into a block.
113 #define minColSpacing1 0.3
114
115 // Minimum spacing between columns, as a fraction of the font size.
116 #define minColSpacing2 1.0
117
118 // Maximum vertical spacing between blocks within a flow, as a
119 // multiple of the font size.
120 #define maxBlockSpacing 2.5
121
122 // Minimum spacing between characters within a word, as a fraction of
123 // the font size.
124 #define minCharSpacing -0.2
125
126 // Maximum spacing between characters within a word, as a fraction of
127 // the font size, when there is no obvious extra-wide character
128 // spacing.
129 #define maxCharSpacing 0.03
130
131 // When extra-wide character spacing is detected, the inter-character
132 // space threshold is set to the minimum inter-character space
133 // multiplied by this constant.
134 #define maxWideCharSpacingMul 1.3
135
136 // Upper limit on spacing between characters in a word.
137 #define maxWideCharSpacing 0.4
138
139 // Max difference in primary,secondary coordinates (as a fraction of
140 // the font size) allowed for duplicated text (fake boldface, drop
141 // shadows) which is to be discarded.
142 #define dupMaxPriDelta 0.1
143 #define dupMaxSecDelta 0.2
144
145 // Max width of underlines (in points).
146 #define maxUnderlineWidth 3
147
148 // Min distance between baseline and underline (in points).
149 //~ this should be font-size-dependent
150 #define minUnderlineGap -2
151
152 // Max distance between baseline and underline (in points).
153 //~ this should be font-size-dependent
154 #define maxUnderlineGap 4
155
156 // Max horizontal distance between edge of word and start of underline
157 // (in points).
158 //~ this should be font-size-dependent
159 #define underlineSlack 1
160
161 // Max distance between edge of text and edge of link border
162 #define hyperlinkSlack 2
163
164 //------------------------------------------------------------------------
165 // TextUnderline
166 //------------------------------------------------------------------------
167
168 class TextUnderline {
169 public:
170
TextUnderline(double x0A,double y0A,double x1A,double y1A)171 TextUnderline(double x0A, double y0A, double x1A, double y1A)
172 { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
~TextUnderline()173 ~TextUnderline() {}
174
175 double x0, y0, x1, y1;
176 GBool horiz;
177 };
178
179 //------------------------------------------------------------------------
180 // TextLink
181 //------------------------------------------------------------------------
182
183 class TextLink {
184 public:
185
TextLink(int xMinA,int yMinA,int xMaxA,int yMaxA,Link * linkA)186 TextLink(int xMinA, int yMinA, int xMaxA, int yMaxA, Link *linkA)
187 { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; link = linkA; }
~TextLink()188 ~TextLink() {}
189
190 int xMin, yMin, xMax, yMax;
191 Link *link;
192 };
193
194 //------------------------------------------------------------------------
195 // TextFontInfo
196 //------------------------------------------------------------------------
197
TextFontInfo(GfxState * state)198 TextFontInfo::TextFontInfo(GfxState *state) {
199 gfxFont = state->getFont();
200 if (gfxFont)
201 gfxFont->incRefCnt();
202 #if TEXTOUT_WORD_LIST
203 fontName = (gfxFont && gfxFont->getOrigName())
204 ? gfxFont->getOrigName()->copy()
205 : (GooString *)NULL;
206 flags = gfxFont ? gfxFont->getFlags() : 0;
207 #endif
208 }
209
~TextFontInfo()210 TextFontInfo::~TextFontInfo() {
211 if (gfxFont)
212 gfxFont->decRefCnt();
213 #if TEXTOUT_WORD_LIST
214 if (fontName) {
215 delete fontName;
216 }
217 #endif
218 }
219
matches(GfxState * state)220 GBool TextFontInfo::matches(GfxState *state) {
221 return state->getFont() == gfxFont;
222 }
223
224 //------------------------------------------------------------------------
225 // TextWord
226 //------------------------------------------------------------------------
227
TextWord(GfxState * state,int rotA,double x0,double y0,int charPosA,TextFontInfo * fontA,double fontSizeA)228 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
229 int charPosA, TextFontInfo *fontA, double fontSizeA) {
230 GfxFont *gfxFont;
231 double x, y, ascent, descent;
232
233 rot = rotA;
234 charPos = charPosA;
235 charLen = 0;
236 font = fontA;
237 fontSize = fontSizeA;
238 state->transform(x0, y0, &x, &y);
239 if ((gfxFont = font->gfxFont)) {
240 ascent = gfxFont->getAscent() * fontSize;
241 descent = gfxFont->getDescent() * fontSize;
242 } else {
243 // this means that the PDF file draws text without a current font,
244 // which should never happen
245 ascent = 0.95 * fontSize;
246 descent = -0.35 * fontSize;
247 }
248 switch (rot) {
249 case 0:
250 yMin = y - ascent;
251 yMax = y - descent;
252 if (yMin == yMax) {
253 // this is a sanity check for a case that shouldn't happen -- but
254 // if it does happen, we want to avoid dividing by zero later
255 yMin = y;
256 yMax = y + 1;
257 }
258 base = y;
259 break;
260 case 1:
261 xMin = x + descent;
262 xMax = x + ascent;
263 if (xMin == xMax) {
264 // this is a sanity check for a case that shouldn't happen -- but
265 // if it does happen, we want to avoid dividing by zero later
266 xMin = x;
267 xMax = x + 1;
268 }
269 base = x;
270 break;
271 case 2:
272 yMin = y + descent;
273 yMax = y + ascent;
274 if (yMin == yMax) {
275 // this is a sanity check for a case that shouldn't happen -- but
276 // if it does happen, we want to avoid dividing by zero later
277 yMin = y;
278 yMax = y + 1;
279 }
280 base = y;
281 break;
282 case 3:
283 xMin = x - ascent;
284 xMax = x - descent;
285 if (xMin == xMax) {
286 // this is a sanity check for a case that shouldn't happen -- but
287 // if it does happen, we want to avoid dividing by zero later
288 xMin = x;
289 xMax = x + 1;
290 }
291 base = x;
292 break;
293 }
294 text = NULL;
295 charcode = NULL;
296 edge = NULL;
297 len = size = 0;
298 spaceAfter = gFalse;
299 next = NULL;
300
301 #if TEXTOUT_WORD_LIST
302 GfxRGB rgb;
303
304 if ((state->getRender() & 3) == 1) {
305 state->getStrokeRGB(&rgb);
306 } else {
307 state->getFillRGB(&rgb);
308 }
309 colorR = colToDbl(rgb.r);
310 colorG = colToDbl(rgb.g);
311 colorB = colToDbl(rgb.b);
312 #endif
313
314 underlined = gFalse;
315 link = NULL;
316 }
317
~TextWord()318 TextWord::~TextWord() {
319 gfree(text);
320 gfree(charcode);
321 gfree(edge);
322 }
323
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,Unicode u)324 void TextWord::addChar(GfxState *state, double x, double y,
325 double dx, double dy, CharCode c, Unicode u) {
326 if (len == size) {
327 size += 16;
328 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
329 charcode = (Unicode *)greallocn(charcode, size, sizeof(CharCode));
330 edge = (double *)greallocn(edge, (size + 1), sizeof(double));
331 }
332 text[len] = u;
333 charcode[len] = c;
334 switch (rot) {
335 case 0:
336 if (len == 0) {
337 xMin = x;
338 }
339 edge[len] = x;
340 xMax = edge[len+1] = x + dx;
341 break;
342 case 1:
343 if (len == 0) {
344 yMin = y;
345 }
346 edge[len] = y;
347 yMax = edge[len+1] = y + dy;
348 break;
349 case 2:
350 if (len == 0) {
351 xMax = x;
352 }
353 edge[len] = x;
354 xMin = edge[len+1] = x + dx;
355 break;
356 case 3:
357 if (len == 0) {
358 yMax = y;
359 }
360 edge[len] = y;
361 yMin = edge[len+1] = y + dy;
362 break;
363 }
364 ++len;
365 }
366
merge(TextWord * word)367 void TextWord::merge(TextWord *word) {
368 int i;
369
370 if (word->xMin < xMin) {
371 xMin = word->xMin;
372 }
373 if (word->yMin < yMin) {
374 yMin = word->yMin;
375 }
376 if (word->xMax > xMax) {
377 xMax = word->xMax;
378 }
379 if (word->yMax > yMax) {
380 yMax = word->yMax;
381 }
382 if (len + word->len > size) {
383 size = len + word->len;
384 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
385 charcode = (CharCode *)greallocn(charcode, (size + 1), sizeof(CharCode));
386 edge = (double *)greallocn(edge, (size + 1), sizeof(double));
387 }
388 for (i = 0; i < word->len; ++i) {
389 text[len + i] = word->text[i];
390 charcode[len + i] = word->charcode[i];
391 edge[len + i] = word->edge[i];
392 }
393 edge[len + word->len] = word->edge[word->len];
394 len += word->len;
395 charLen += word->charLen;
396 }
397
primaryCmp(TextWord * word)398 inline int TextWord::primaryCmp(TextWord *word) {
399 double cmp;
400
401 cmp = 0; // make gcc happy
402 switch (rot) {
403 case 0:
404 cmp = xMin - word->xMin;
405 break;
406 case 1:
407 cmp = yMin - word->yMin;
408 break;
409 case 2:
410 cmp = word->xMax - xMax;
411 break;
412 case 3:
413 cmp = word->yMax - yMax;
414 break;
415 }
416 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
417 }
418
primaryDelta(TextWord * word)419 double TextWord::primaryDelta(TextWord *word) {
420 double delta;
421
422 delta = 0; // make gcc happy
423 switch (rot) {
424 case 0:
425 delta = word->xMin - xMax;
426 break;
427 case 1:
428 delta = word->yMin - yMax;
429 break;
430 case 2:
431 delta = xMin - word->xMax;
432 break;
433 case 3:
434 delta = yMin - word->yMax;
435 break;
436 }
437 return delta;
438 }
439
cmpYX(const void * p1,const void * p2)440 int TextWord::cmpYX(const void *p1, const void *p2) {
441 TextWord *word1 = *(TextWord **)p1;
442 TextWord *word2 = *(TextWord **)p2;
443 double cmp;
444
445 cmp = word1->yMin - word2->yMin;
446 if (cmp == 0) {
447 cmp = word1->xMin - word2->xMin;
448 }
449 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
450 }
451
452 #if TEXTOUT_WORD_LIST
453
getText()454 GooString *TextWord::getText() {
455 GooString *s;
456 UnicodeMap *uMap;
457 char buf[8];
458 int n, i;
459
460 s = new GooString();
461 if (!(uMap = globalParams->getTextEncoding())) {
462 return s;
463 }
464 for (i = 0; i < len; ++i) {
465 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
466 s->append(buf, n);
467 }
468 uMap->decRefCnt();
469 return s;
470 }
471
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)472 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
473 double *xMaxA, double *yMaxA) {
474 if (charIdx < 0 || charIdx >= len) {
475 return;
476 }
477 switch (rot) {
478 case 0:
479 *xMinA = edge[charIdx];
480 *xMaxA = edge[charIdx + 1];
481 *yMinA = yMin;
482 *yMaxA = yMax;
483 break;
484 case 1:
485 *xMinA = xMin;
486 *xMaxA = xMax;
487 *yMinA = edge[charIdx];
488 *yMaxA = edge[charIdx + 1];
489 break;
490 case 2:
491 *xMinA = edge[charIdx + 1];
492 *xMaxA = edge[charIdx];
493 *yMinA = yMin;
494 *yMaxA = yMax;
495 break;
496 case 3:
497 *xMinA = xMin;
498 *xMaxA = xMax;
499 *yMinA = edge[charIdx + 1];
500 *yMaxA = edge[charIdx];
501 break;
502 }
503 }
504
505 #endif // TEXTOUT_WORD_LIST
506
507 //------------------------------------------------------------------------
508 // TextPool
509 //------------------------------------------------------------------------
510
TextPool()511 TextPool::TextPool() {
512 minBaseIdx = 0;
513 maxBaseIdx = -1;
514 pool = NULL;
515 cursor = NULL;
516 cursorBaseIdx = -1;
517 }
518
~TextPool()519 TextPool::~TextPool() {
520 int baseIdx;
521 TextWord *word, *word2;
522
523 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
524 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
525 word2 = word->next;
526 delete word;
527 }
528 }
529 gfree(pool);
530 }
531
getBaseIdx(double base)532 int TextPool::getBaseIdx(double base) {
533 int baseIdx;
534
535 baseIdx = (int)(base / textPoolStep);
536 if (baseIdx < minBaseIdx) {
537 return minBaseIdx;
538 }
539 if (baseIdx > maxBaseIdx) {
540 return maxBaseIdx;
541 }
542 return baseIdx;
543 }
544
addWord(TextWord * word)545 void TextPool::addWord(TextWord *word) {
546 TextWord **newPool;
547 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
548 TextWord *w0, *w1;
549
550 // expand the array if needed
551 wordBaseIdx = (int)(word->base / textPoolStep);
552 if (minBaseIdx > maxBaseIdx) {
553 minBaseIdx = wordBaseIdx - 128;
554 maxBaseIdx = wordBaseIdx + 128;
555 pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
556 sizeof(TextWord *));
557 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
558 pool[baseIdx - minBaseIdx] = NULL;
559 }
560 } else if (wordBaseIdx < minBaseIdx) {
561 newMinBaseIdx = wordBaseIdx - 128;
562 newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
563 sizeof(TextWord *));
564 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
565 newPool[baseIdx - newMinBaseIdx] = NULL;
566 }
567 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
568 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
569 gfree(pool);
570 pool = newPool;
571 minBaseIdx = newMinBaseIdx;
572 } else if (wordBaseIdx > maxBaseIdx) {
573 newMaxBaseIdx = wordBaseIdx + 128;
574 pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
575 sizeof(TextWord *));
576 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
577 pool[baseIdx - minBaseIdx] = NULL;
578 }
579 maxBaseIdx = newMaxBaseIdx;
580 }
581
582 // insert the new word
583 if (cursor && wordBaseIdx == cursorBaseIdx &&
584 word->primaryCmp(cursor) > 0) {
585 w0 = cursor;
586 w1 = cursor->next;
587 } else {
588 w0 = NULL;
589 w1 = pool[wordBaseIdx - minBaseIdx];
590 }
591 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
592 word->next = w1;
593 if (w0) {
594 w0->next = word;
595 } else {
596 pool[wordBaseIdx - minBaseIdx] = word;
597 }
598 cursor = word;
599 cursorBaseIdx = wordBaseIdx;
600 }
601
602 //------------------------------------------------------------------------
603 // TextLine
604 //------------------------------------------------------------------------
605
TextLine(TextBlock * blkA,int rotA,double baseA)606 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
607 blk = blkA;
608 rot = rotA;
609 base = baseA;
610 words = lastWord = NULL;
611 text = NULL;
612 edge = NULL;
613 col = NULL;
614 len = 0;
615 convertedLen = 0;
616 hyphenated = gFalse;
617 next = NULL;
618 xMin = yMin = 0;
619 xMax = yMax = -1;
620 normalized = NULL;
621 normalized_len = 0;
622 normalized_idx = NULL;
623 }
624
~TextLine()625 TextLine::~TextLine() {
626 TextWord *word;
627
628 while (words) {
629 word = words;
630 words = words->next;
631 delete word;
632 }
633 gfree(text);
634 gfree(edge);
635 gfree(col);
636 if (normalized) {
637 gfree(normalized);
638 gfree(normalized_idx);
639 }
640 }
641
addWord(TextWord * word)642 void TextLine::addWord(TextWord *word) {
643 if (lastWord) {
644 lastWord->next = word;
645 } else {
646 words = word;
647 }
648 lastWord = word;
649
650 if (xMin > xMax) {
651 xMin = word->xMin;
652 xMax = word->xMax;
653 yMin = word->yMin;
654 yMax = word->yMax;
655 } else {
656 if (word->xMin < xMin) {
657 xMin = word->xMin;
658 }
659 if (word->xMax > xMax) {
660 xMax = word->xMax;
661 }
662 if (word->yMin < yMin) {
663 yMin = word->yMin;
664 }
665 if (word->yMax > yMax) {
666 yMax = word->yMax;
667 }
668 }
669 }
670
primaryDelta(TextLine * line)671 double TextLine::primaryDelta(TextLine *line) {
672 double delta;
673
674 delta = 0; // make gcc happy
675 switch (rot) {
676 case 0:
677 delta = line->xMin - xMax;
678 break;
679 case 1:
680 delta = line->yMin - yMax;
681 break;
682 case 2:
683 delta = xMin - line->xMax;
684 break;
685 case 3:
686 delta = yMin - line->yMax;
687 break;
688 }
689 return delta;
690 }
691
primaryCmp(TextLine * line)692 int TextLine::primaryCmp(TextLine *line) {
693 double cmp;
694
695 cmp = 0; // make gcc happy
696 switch (rot) {
697 case 0:
698 cmp = xMin - line->xMin;
699 break;
700 case 1:
701 cmp = yMin - line->yMin;
702 break;
703 case 2:
704 cmp = line->xMax - xMax;
705 break;
706 case 3:
707 cmp = line->yMax - yMax;
708 break;
709 }
710 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
711 }
712
secondaryCmp(TextLine * line)713 int TextLine::secondaryCmp(TextLine *line) {
714 double cmp;
715
716 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
717 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
718 }
719
cmpYX(TextLine * line)720 int TextLine::cmpYX(TextLine *line) {
721 int cmp;
722
723 if ((cmp = secondaryCmp(line))) {
724 return cmp;
725 }
726 return primaryCmp(line);
727 }
728
cmpXY(const void * p1,const void * p2)729 int TextLine::cmpXY(const void *p1, const void *p2) {
730 TextLine *line1 = *(TextLine **)p1;
731 TextLine *line2 = *(TextLine **)p2;
732 int cmp;
733
734 if ((cmp = line1->primaryCmp(line2))) {
735 return cmp;
736 }
737 return line1->secondaryCmp(line2);
738 }
739
coalesce(UnicodeMap * uMap)740 void TextLine::coalesce(UnicodeMap *uMap) {
741 TextWord *word0, *word1;
742 double space, delta, minSpace;
743 GBool isUnicode;
744 char buf[8];
745 int i, j;
746
747 if (words->next) {
748
749 // compute the inter-word space threshold
750 if (words->len > 1 || words->next->len > 1) {
751 minSpace = 0;
752 } else {
753 minSpace = words->primaryDelta(words->next);
754 for (word0 = words->next, word1 = word0->next;
755 word1 && minSpace > 0;
756 word0 = word1, word1 = word0->next) {
757 if (word1->len > 1) {
758 minSpace = 0;
759 }
760 delta = word0->primaryDelta(word1);
761 if (delta < minSpace) {
762 minSpace = delta;
763 }
764 }
765 }
766 if (minSpace <= 0) {
767 space = maxCharSpacing * words->fontSize;
768 } else {
769 space = maxWideCharSpacingMul * minSpace;
770 if (space > maxWideCharSpacing * words->fontSize) {
771 space = maxWideCharSpacing * words->fontSize;
772 }
773 }
774
775 // merge words
776 word0 = words;
777 word1 = words->next;
778 while (word1) {
779 if (word0->primaryDelta(word1) >= space) {
780 word0->spaceAfter = gTrue;
781 word0 = word1;
782 word1 = word1->next;
783 } else if (word0->font == word1->font &&
784 word0->underlined == word1->underlined &&
785 fabs(word0->fontSize - word1->fontSize) <
786 maxWordFontSizeDelta * words->fontSize &&
787 word1->charPos == word0->charPos + word0->charLen) {
788 word0->merge(word1);
789 word0->next = word1->next;
790 delete word1;
791 word1 = word0->next;
792 } else {
793 word0 = word1;
794 word1 = word1->next;
795 }
796 }
797 }
798
799 // build the line text
800 isUnicode = uMap ? uMap->isUnicode() : gFalse;
801 len = 0;
802 for (word1 = words; word1; word1 = word1->next) {
803 len += word1->len;
804 if (word1->spaceAfter) {
805 ++len;
806 }
807 }
808 text = (Unicode *)gmallocn(len, sizeof(Unicode));
809 edge = (double *)gmallocn(len + 1, sizeof(double));
810 i = 0;
811 for (word1 = words; word1; word1 = word1->next) {
812 for (j = 0; j < word1->len; ++j) {
813 text[i] = word1->text[j];
814 edge[i] = word1->edge[j];
815 ++i;
816 }
817 edge[i] = word1->edge[word1->len];
818 if (word1->spaceAfter) {
819 text[i] = (Unicode)0x0020;
820 ++i;
821 }
822 }
823
824 // compute convertedLen and set up the col array
825 col = (int *)gmallocn(len + 1, sizeof(int));
826 convertedLen = 0;
827 for (i = 0; i < len; ++i) {
828 col[i] = convertedLen;
829 if (isUnicode) {
830 ++convertedLen;
831 } else if (uMap) {
832 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
833 }
834 }
835 col[len] = convertedLen;
836
837 // check for hyphen at end of line
838 //~ need to check for other chars used as hyphens
839 hyphenated = text[len - 1] == (Unicode)'-';
840 }
841
842 //------------------------------------------------------------------------
843 // TextLineFrag
844 //------------------------------------------------------------------------
845
846 class TextLineFrag {
847 public:
848
849 TextLine *line; // the line object
850 int start, len; // offset and length of this fragment
851 // (in Unicode chars)
852 double xMin, xMax; // bounding box coordinates
853 double yMin, yMax;
854 double base; // baseline virtual coordinate
855 int col; // first column
856
857 void init(TextLine *lineA, int startA, int lenA);
858 void computeCoords(GBool oneRot);
859
860 static int cmpYXPrimaryRot(const void *p1, const void *p2);
861 static int cmpYXLineRot(const void *p1, const void *p2);
862 static int cmpXYLineRot(const void *p1, const void *p2);
863 static int cmpXYColumnPrimaryRot(const void *p1, const void *p2);
864 static int cmpXYColumnLineRot(const void *p1, const void *p2);
865 };
866
init(TextLine * lineA,int startA,int lenA)867 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
868 line = lineA;
869 start = startA;
870 len = lenA;
871 col = line->col[start];
872 }
873
computeCoords(GBool oneRot)874 void TextLineFrag::computeCoords(GBool oneRot) {
875 TextBlock *blk;
876 double d0, d1, d2, d3, d4;
877
878 if (oneRot) {
879
880 switch (line->rot) {
881 case 0:
882 xMin = line->edge[start];
883 xMax = line->edge[start + len];
884 yMin = line->yMin;
885 yMax = line->yMax;
886 break;
887 case 1:
888 xMin = line->xMin;
889 xMax = line->xMax;
890 yMin = line->edge[start];
891 yMax = line->edge[start + len];
892 break;
893 case 2:
894 xMin = line->edge[start + len];
895 xMax = line->edge[start];
896 yMin = line->yMin;
897 yMax = line->yMax;
898 break;
899 case 3:
900 xMin = line->xMin;
901 xMax = line->xMax;
902 yMin = line->edge[start + len];
903 yMax = line->edge[start];
904 break;
905 }
906 base = line->base;
907
908 } else {
909
910 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
911
912 xMin = line->edge[start];
913 xMax = line->edge[start + len];
914 yMin = line->yMin;
915 yMax = line->yMax;
916 base = line->base;
917
918 } else {
919
920 blk = line->blk;
921 d0 = line->edge[start];
922 d1 = line->edge[start + len];
923 d2 = d3 = d4 = 0; // make gcc happy
924
925 switch (line->rot) {
926 case 0:
927 d2 = line->yMin;
928 d3 = line->yMax;
929 d4 = line->base;
930 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
931 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
932 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
933 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
934 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
935 break;
936 case 1:
937 d2 = line->xMax;
938 d3 = line->xMin;
939 d4 = line->base;
940 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
941 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
942 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
943 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
944 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
945 break;
946 case 2:
947 d2 = line->yMax;
948 d3 = line->yMin;
949 d4 = line->base;
950 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
951 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
952 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
953 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
954 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
955 break;
956 case 3:
957 d2 = line->xMin;
958 d3 = line->xMax;
959 d4 = line->base;
960 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
961 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
962 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
963 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
964 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
965 break;
966 }
967
968 switch (line->blk->page->primaryRot) {
969 case 0:
970 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
971 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
972 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
973 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
974 base = blk->yMin + base * (blk->yMax - blk->yMin);
975 break;
976 case 1:
977 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
978 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
979 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
980 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
981 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
982 break;
983 case 2:
984 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
985 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
986 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
987 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
988 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
989 break;
990 case 3:
991 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
992 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
993 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
994 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
995 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
996 break;
997 }
998
999 }
1000 }
1001 }
1002
cmpYXPrimaryRot(const void * p1,const void * p2)1003 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
1004 TextLineFrag *frag1 = (TextLineFrag *)p1;
1005 TextLineFrag *frag2 = (TextLineFrag *)p2;
1006 double cmp;
1007
1008 cmp = 0; // make gcc happy
1009 switch (frag1->line->blk->page->primaryRot) {
1010 case 0:
1011 if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
1012 cmp = frag1->xMin - frag2->xMin;
1013 }
1014 break;
1015 case 1:
1016 if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
1017 cmp = frag1->yMin - frag2->yMin;
1018 }
1019 break;
1020 case 2:
1021 if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
1022 cmp = frag2->xMax - frag1->xMax;
1023 }
1024 break;
1025 case 3:
1026 if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
1027 cmp = frag2->yMax - frag1->yMax;
1028 }
1029 break;
1030 }
1031 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1032 }
1033
cmpYXLineRot(const void * p1,const void * p2)1034 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
1035 TextLineFrag *frag1 = (TextLineFrag *)p1;
1036 TextLineFrag *frag2 = (TextLineFrag *)p2;
1037 double cmp;
1038
1039 cmp = 0; // make gcc happy
1040 switch (frag1->line->rot) {
1041 case 0:
1042 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1043 cmp = frag1->xMin - frag2->xMin;
1044 }
1045 break;
1046 case 1:
1047 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1048 cmp = frag1->yMin - frag2->yMin;
1049 }
1050 break;
1051 case 2:
1052 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
1053 cmp = frag2->xMax - frag1->xMax;
1054 }
1055 break;
1056 case 3:
1057 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
1058 cmp = frag2->yMax - frag1->yMax;
1059 }
1060 break;
1061 }
1062 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1063 }
1064
cmpXYLineRot(const void * p1,const void * p2)1065 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
1066 TextLineFrag *frag1 = (TextLineFrag *)p1;
1067 TextLineFrag *frag2 = (TextLineFrag *)p2;
1068 double cmp;
1069
1070 cmp = 0; // make gcc happy
1071 switch (frag1->line->rot) {
1072 case 0:
1073 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
1074 cmp = frag1->yMin - frag2->yMin;
1075 }
1076 break;
1077 case 1:
1078 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
1079 cmp = frag2->xMax - frag1->xMax;
1080 }
1081 break;
1082 case 2:
1083 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
1084 cmp = frag2->yMin - frag1->yMin;
1085 }
1086 break;
1087 case 3:
1088 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
1089 cmp = frag1->xMax - frag2->xMax;
1090 }
1091 break;
1092 }
1093 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1094 }
1095
cmpXYColumnPrimaryRot(const void * p1,const void * p2)1096 int TextLineFrag::cmpXYColumnPrimaryRot(const void *p1, const void *p2) {
1097 TextLineFrag *frag1 = (TextLineFrag *)p1;
1098 TextLineFrag *frag2 = (TextLineFrag *)p2;
1099 double cmp;
1100
1101 // if columns overlap, compare y values
1102 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1103 frag2->line->col[frag2->start]) &&
1104 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1105 frag1->line->col[frag1->start])) {
1106 cmp = 0; // make gcc happy
1107 switch (frag1->line->blk->page->primaryRot) {
1108 case 0: cmp = frag1->yMin - frag2->yMin; break;
1109 case 1: cmp = frag2->xMax - frag1->xMax; break;
1110 case 2: cmp = frag2->yMin - frag1->yMin; break;
1111 case 3: cmp = frag1->xMax - frag2->xMax; break;
1112 }
1113 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1114 }
1115
1116 // otherwise, compare starting column
1117 return frag1->col - frag2->col;
1118 }
1119
cmpXYColumnLineRot(const void * p1,const void * p2)1120 int TextLineFrag::cmpXYColumnLineRot(const void *p1, const void *p2) {
1121 TextLineFrag *frag1 = (TextLineFrag *)p1;
1122 TextLineFrag *frag2 = (TextLineFrag *)p2;
1123 double cmp;
1124
1125 // if columns overlap, compare y values
1126 if (frag1->col < frag2->col + (frag2->line->col[frag2->start + frag2->len] -
1127 frag2->line->col[frag2->start]) &&
1128 frag2->col < frag1->col + (frag1->line->col[frag1->start + frag1->len] -
1129 frag1->line->col[frag1->start])) {
1130 cmp = 0; // make gcc happy
1131 switch (frag1->line->rot) {
1132 case 0: cmp = frag1->yMin - frag2->yMin; break;
1133 case 1: cmp = frag2->xMax - frag1->xMax; break;
1134 case 2: cmp = frag2->yMin - frag1->yMin; break;
1135 case 3: cmp = frag1->xMax - frag2->xMax; break;
1136 }
1137 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1138 }
1139
1140 // otherwise, compare starting column
1141 return frag1->col - frag2->col;
1142 }
1143
1144 //------------------------------------------------------------------------
1145 // TextBlock
1146 //------------------------------------------------------------------------
1147
TextBlock(TextPage * pageA,int rotA)1148 TextBlock::TextBlock(TextPage *pageA, int rotA) {
1149 page = pageA;
1150 rot = rotA;
1151 xMin = yMin = 0;
1152 xMax = yMax = -1;
1153 priMin = 0;
1154 priMax = page->pageWidth;
1155 pool = new TextPool();
1156 lines = NULL;
1157 curLine = NULL;
1158 next = NULL;
1159 stackNext = NULL;
1160 tableId = -1;
1161 tableEnd = gFalse;
1162 }
1163
~TextBlock()1164 TextBlock::~TextBlock() {
1165 TextLine *line;
1166
1167 delete pool;
1168 while (lines) {
1169 line = lines;
1170 lines = lines->next;
1171 delete line;
1172 }
1173 }
1174
addWord(TextWord * word)1175 void TextBlock::addWord(TextWord *word) {
1176 pool->addWord(word);
1177 if (xMin > xMax) {
1178 xMin = word->xMin;
1179 xMax = word->xMax;
1180 yMin = word->yMin;
1181 yMax = word->yMax;
1182 } else {
1183 if (word->xMin < xMin) {
1184 xMin = word->xMin;
1185 }
1186 if (word->xMax > xMax) {
1187 xMax = word->xMax;
1188 }
1189 if (word->yMin < yMin) {
1190 yMin = word->yMin;
1191 }
1192 if (word->yMax > yMax) {
1193 yMax = word->yMax;
1194 }
1195 }
1196 }
1197
coalesce(UnicodeMap * uMap)1198 void TextBlock::coalesce(UnicodeMap *uMap) {
1199 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1200 TextLine *line, *line0, *line1;
1201 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1202 int baseIdx, bestWordBaseIdx, idx0, idx1;
1203 double minBase, maxBase;
1204 double fontSize, delta, priDelta, secDelta;
1205 TextLine **lineArray;
1206 GBool found;
1207 int col1, col2;
1208 int i, j, k;
1209
1210 // discard duplicated text (fake boldface, drop shadows)
1211 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1212 word0 = pool->getPool(idx0);
1213 while (word0) {
1214 priDelta = dupMaxPriDelta * word0->fontSize;
1215 secDelta = dupMaxSecDelta * word0->fontSize;
1216 if (rot == 0 || rot == 3) {
1217 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1218 } else {
1219 maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1220 }
1221 found = gFalse;
1222 word1 = word2 = NULL; // make gcc happy
1223 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1224 if (idx1 == idx0) {
1225 word1 = word0;
1226 word2 = word0->next;
1227 } else {
1228 word1 = NULL;
1229 word2 = pool->getPool(idx1);
1230 }
1231 for (; word2; word1 = word2, word2 = word2->next) {
1232 if (word2->len == word0->len &&
1233 !memcmp(word2->text, word0->text,
1234 word0->len * sizeof(Unicode))) {
1235 switch (rot) {
1236 case 0:
1237 case 2:
1238 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1239 fabs(word0->xMax - word2->xMax) < priDelta &&
1240 fabs(word0->yMin - word2->yMin) < secDelta &&
1241 fabs(word0->yMax - word2->yMax) < secDelta;
1242 break;
1243 case 1:
1244 case 3:
1245 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1246 fabs(word0->xMax - word2->xMax) < secDelta &&
1247 fabs(word0->yMin - word2->yMin) < priDelta &&
1248 fabs(word0->yMax - word2->yMax) < priDelta;
1249 break;
1250 }
1251 }
1252 if (found) {
1253 break;
1254 }
1255 }
1256 if (found) {
1257 break;
1258 }
1259 }
1260 if (found) {
1261 if (word1) {
1262 word1->next = word2->next;
1263 } else {
1264 pool->setPool(idx1, word2->next);
1265 }
1266 delete word2;
1267 } else {
1268 word0 = word0->next;
1269 }
1270 }
1271 }
1272
1273 // build the lines
1274 curLine = NULL;
1275 poolMinBaseIdx = pool->minBaseIdx;
1276 charCount = 0;
1277 nLines = 0;
1278 while (1) {
1279
1280 // find the first non-empty line in the pool
1281 for (;
1282 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1283 ++poolMinBaseIdx) ;
1284 if (poolMinBaseIdx > pool->maxBaseIdx) {
1285 break;
1286 }
1287
1288 // look for the left-most word in the first four lines of the
1289 // pool -- this avoids starting with a superscript word
1290 startBaseIdx = poolMinBaseIdx;
1291 for (baseIdx = poolMinBaseIdx + 1;
1292 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1293 ++baseIdx) {
1294 if (!pool->getPool(baseIdx)) {
1295 continue;
1296 }
1297 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1298 < 0) {
1299 startBaseIdx = baseIdx;
1300 }
1301 }
1302
1303 // create a new line
1304 word0 = pool->getPool(startBaseIdx);
1305 pool->setPool(startBaseIdx, word0->next);
1306 word0->next = NULL;
1307 line = new TextLine(this, word0->rot, word0->base);
1308 line->addWord(word0);
1309 lastWord = word0;
1310
1311 // compute the search range
1312 fontSize = word0->fontSize;
1313 minBase = word0->base - maxIntraLineDelta * fontSize;
1314 maxBase = word0->base + maxIntraLineDelta * fontSize;
1315 minBaseIdx = pool->getBaseIdx(minBase);
1316 maxBaseIdx = pool->getBaseIdx(maxBase);
1317
1318 // find the rest of the words in this line
1319 while (1) {
1320
1321 // find the left-most word whose baseline is in the range for
1322 // this line
1323 bestWordBaseIdx = 0;
1324 bestWord0 = bestWord1 = NULL;
1325 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1326 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1327 word1;
1328 word0 = word1, word1 = word1->next) {
1329 if (word1->base >= minBase &&
1330 word1->base <= maxBase &&
1331 (delta = lastWord->primaryDelta(word1)) >=
1332 minCharSpacing * fontSize) {
1333 if (delta < maxWordSpacing * fontSize &&
1334 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1335 bestWordBaseIdx = baseIdx;
1336 bestWord0 = word0;
1337 bestWord1 = word1;
1338 }
1339 break;
1340 }
1341 }
1342 }
1343 if (!bestWord1) {
1344 break;
1345 }
1346
1347 // remove it from the pool, and add it to the line
1348 if (bestWord0) {
1349 bestWord0->next = bestWord1->next;
1350 } else {
1351 pool->setPool(bestWordBaseIdx, bestWord1->next);
1352 }
1353 bestWord1->next = NULL;
1354 line->addWord(bestWord1);
1355 lastWord = bestWord1;
1356 }
1357
1358 // add the line
1359 if (curLine && line->cmpYX(curLine) > 0) {
1360 line0 = curLine;
1361 line1 = curLine->next;
1362 } else {
1363 line0 = NULL;
1364 line1 = lines;
1365 }
1366 for (;
1367 line1 && line->cmpYX(line1) > 0;
1368 line0 = line1, line1 = line1->next) ;
1369 if (line0) {
1370 line0->next = line;
1371 } else {
1372 lines = line;
1373 }
1374 line->next = line1;
1375 curLine = line;
1376 line->coalesce(uMap);
1377 charCount += line->len;
1378 ++nLines;
1379 }
1380
1381 // sort lines into xy order for column assignment
1382 lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1383 for (line = lines, i = 0; line; line = line->next, ++i) {
1384 lineArray[i] = line;
1385 }
1386 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1387
1388 // column assignment
1389 nColumns = 0;
1390 for (i = 0; i < nLines; ++i) {
1391 line0 = lineArray[i];
1392 col1 = 0;
1393 for (j = 0; j < i; ++j) {
1394 line1 = lineArray[j];
1395 if (line1->primaryDelta(line0) >= 0) {
1396 col2 = line1->col[line1->len] + 1;
1397 } else {
1398 k = 0; // make gcc happy
1399 switch (rot) {
1400 case 0:
1401 for (k = 0;
1402 k < line1->len &&
1403 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1404 ++k) ;
1405 break;
1406 case 1:
1407 for (k = 0;
1408 k < line1->len &&
1409 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1410 ++k) ;
1411 break;
1412 case 2:
1413 for (k = 0;
1414 k < line1->len &&
1415 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1416 ++k) ;
1417 break;
1418 case 3:
1419 for (k = 0;
1420 k < line1->len &&
1421 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1422 ++k) ;
1423 break;
1424 }
1425 col2 = line1->col[k];
1426 }
1427 if (col2 > col1) {
1428 col1 = col2;
1429 }
1430 }
1431 for (k = 0; k <= line0->len; ++k) {
1432 line0->col[k] += col1;
1433 }
1434 if (line0->col[line0->len] > nColumns) {
1435 nColumns = line0->col[line0->len];
1436 }
1437 }
1438 gfree(lineArray);
1439 }
1440
updatePriMinMax(TextBlock * blk)1441 void TextBlock::updatePriMinMax(TextBlock *blk) {
1442 double newPriMin, newPriMax;
1443 GBool gotPriMin, gotPriMax;
1444
1445 gotPriMin = gotPriMax = gFalse;
1446 newPriMin = newPriMax = 0; // make gcc happy
1447 switch (page->primaryRot) {
1448 case 0:
1449 case 2:
1450 if (blk->yMin < yMax && blk->yMax > yMin) {
1451 if (blk->xMin < xMin) {
1452 newPriMin = blk->xMax;
1453 gotPriMin = gTrue;
1454 }
1455 if (blk->xMax > xMax) {
1456 newPriMax = blk->xMin;
1457 gotPriMax = gTrue;
1458 }
1459 }
1460 break;
1461 case 1:
1462 case 3:
1463 if (blk->xMin < xMax && blk->xMax > xMin) {
1464 if (blk->yMin < yMin) {
1465 newPriMin = blk->yMax;
1466 gotPriMin = gTrue;
1467 }
1468 if (blk->yMax > yMax) {
1469 newPriMax = blk->yMin;
1470 gotPriMax = gTrue;
1471 }
1472 }
1473 break;
1474 }
1475 if (gotPriMin) {
1476 if (newPriMin > xMin) {
1477 newPriMin = xMin;
1478 }
1479 if (newPriMin > priMin) {
1480 priMin = newPriMin;
1481 }
1482 }
1483 if (gotPriMax) {
1484 if (newPriMax < xMax) {
1485 newPriMax = xMax;
1486 }
1487 if (newPriMax < priMax) {
1488 priMax = newPriMax;
1489 }
1490 }
1491 }
1492
cmpXYPrimaryRot(const void * p1,const void * p2)1493 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1494 TextBlock *blk1 = *(TextBlock **)p1;
1495 TextBlock *blk2 = *(TextBlock **)p2;
1496 double cmp;
1497
1498 cmp = 0; // make gcc happy
1499 switch (blk1->page->primaryRot) {
1500 case 0:
1501 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1502 cmp = blk1->yMin - blk2->yMin;
1503 }
1504 break;
1505 case 1:
1506 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1507 cmp = blk2->xMax - blk1->xMax;
1508 }
1509 break;
1510 case 2:
1511 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1512 cmp = blk2->yMin - blk1->yMin;
1513 }
1514 break;
1515 case 3:
1516 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1517 cmp = blk1->xMax - blk2->xMax;
1518 }
1519 break;
1520 }
1521 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1522 }
1523
cmpYXPrimaryRot(const void * p1,const void * p2)1524 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1525 TextBlock *blk1 = *(TextBlock **)p1;
1526 TextBlock *blk2 = *(TextBlock **)p2;
1527 double cmp;
1528
1529 cmp = 0; // make gcc happy
1530 switch (blk1->page->primaryRot) {
1531 case 0:
1532 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1533 cmp = blk1->xMin - blk2->xMin;
1534 }
1535 break;
1536 case 1:
1537 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1538 cmp = blk1->yMin - blk2->yMin;
1539 }
1540 break;
1541 case 2:
1542 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1543 cmp = blk2->xMax - blk1->xMax;
1544 }
1545 break;
1546 case 3:
1547 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1548 cmp = blk2->yMax - blk1->yMax;
1549 }
1550 break;
1551 }
1552 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1553 }
1554
primaryCmp(TextBlock * blk)1555 int TextBlock::primaryCmp(TextBlock *blk) {
1556 double cmp;
1557
1558 cmp = 0; // make gcc happy
1559 switch (rot) {
1560 case 0:
1561 cmp = xMin - blk->xMin;
1562 break;
1563 case 1:
1564 cmp = yMin - blk->yMin;
1565 break;
1566 case 2:
1567 cmp = blk->xMax - xMax;
1568 break;
1569 case 3:
1570 cmp = blk->yMax - yMax;
1571 break;
1572 }
1573 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1574 }
1575
secondaryDelta(TextBlock * blk)1576 double TextBlock::secondaryDelta(TextBlock *blk) {
1577 double delta;
1578
1579 delta = 0; // make gcc happy
1580 switch (rot) {
1581 case 0:
1582 delta = blk->yMin - yMax;
1583 break;
1584 case 1:
1585 delta = xMin - blk->xMax;
1586 break;
1587 case 2:
1588 delta = yMin - blk->yMax;
1589 break;
1590 case 3:
1591 delta = blk->xMin - xMax;
1592 break;
1593 }
1594 return delta;
1595 }
1596
isBelow(TextBlock * blk)1597 GBool TextBlock::isBelow(TextBlock *blk) {
1598 GBool below;
1599
1600 below = gFalse; // make gcc happy
1601 switch (page->primaryRot) {
1602 case 0:
1603 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1604 yMin > blk->yMin;
1605 break;
1606 case 1:
1607 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1608 xMax < blk->xMax;
1609 break;
1610 case 2:
1611 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1612 yMax < blk->yMax;
1613 break;
1614 case 3:
1615 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1616 xMin > blk->xMin;
1617 break;
1618 }
1619
1620 return below;
1621 }
1622
isBeforeByRule1(TextBlock * blk1)1623 GBool TextBlock::isBeforeByRule1(TextBlock *blk1) {
1624 GBool before = gFalse;
1625 GBool overlap = gFalse;
1626
1627 switch (this->page->primaryRot) {
1628 case 0:
1629 case 2:
1630 overlap = ((this->ExMin <= blk1->ExMin) &&
1631 (blk1->ExMin <= this->ExMax)) ||
1632 ((blk1->ExMin <= this->ExMin) &&
1633 (this->ExMin <= blk1->ExMax));
1634 break;
1635 case 1:
1636 case 3:
1637 overlap = ((this->EyMin <= blk1->EyMin) &&
1638 (blk1->EyMin <= this->EyMax)) ||
1639 ((blk1->EyMin <= this->EyMin) &&
1640 (this->EyMin <= blk1->EyMax));
1641 break;
1642 }
1643 switch (this->page->primaryRot) {
1644 case 0:
1645 before = overlap && this->EyMin < blk1->EyMin;
1646 break;
1647 case 1:
1648 before = overlap && this->ExMax > blk1->ExMax;
1649 break;
1650 case 2:
1651 before = overlap && this->EyMax > blk1->EyMax;
1652 break;
1653 case 3:
1654 before = overlap && this->ExMin < blk1->ExMin;
1655 break;
1656 }
1657 return before;
1658 }
1659
isBeforeByRule2(TextBlock * blk1)1660 GBool TextBlock::isBeforeByRule2(TextBlock *blk1) {
1661 double cmp = 0;
1662 int rotLR = rot;
1663
1664 if (!page->primaryLR) {
1665 rotLR = (rotLR + 2) % 4;
1666 }
1667
1668 switch (rotLR) {
1669 case 0:
1670 cmp = ExMax - blk1->ExMin;
1671 break;
1672 case 1:
1673 cmp = EyMin - blk1->EyMax;
1674 break;
1675 case 2:
1676 cmp = blk1->ExMax - ExMin;
1677 break;
1678 case 3:
1679 cmp = blk1->EyMin - EyMax;
1680 break;
1681 }
1682 return cmp <= 0;
1683 }
1684
1685 // Sort into reading order by performing a topological sort using the rules
1686 // given in "High Performance Document Layout Analysis", T.M. Breuel, 2003.
1687 // See http://pubs.iupr.org/#2003-breuel-sdiut
1688 // Topological sort is done by depth first search, see
1689 // http://en.wikipedia.org/wiki/Topological_sorting
visitDepthFirst(TextBlock * blkList,int pos1,TextBlock ** sorted,int sortPos,GBool * visited)1690 int TextBlock::visitDepthFirst(TextBlock *blkList, int pos1,
1691 TextBlock **sorted, int sortPos,
1692 GBool* visited) {
1693 int pos2;
1694 TextBlock *blk1, *blk2, *blk3;
1695 GBool before;
1696
1697 if (visited[pos1]) {
1698 return sortPos;
1699 }
1700
1701 blk1 = this;
1702
1703 #if 0 // for debugging
1704 printf("visited: %d %.2f..%.2f %.2f..%.2f\n",
1705 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1706 #endif
1707 visited[pos1] = gTrue;
1708 pos2 = -1;
1709 for (blk2 = blkList; blk2; blk2 = blk2->next) {
1710 pos2++;
1711 if (visited[pos2]) {
1712 // skip visited nodes
1713 continue;
1714 }
1715 before = gFalse;
1716
1717 // is blk2 before blk1? (for table entries)
1718 if (blk1->tableId >= 0 && blk1->tableId == blk2->tableId) {
1719 if (page->primaryLR) {
1720 if (blk2->xMax <= blk1->xMin &&
1721 blk2->yMin <= blk1->yMax &&
1722 blk2->yMax >= blk1->yMin)
1723 before = gTrue;
1724 } else {
1725 if (blk2->xMin >= blk1->xMax &&
1726 blk2->yMin <= blk1->yMax &&
1727 blk2->yMax >= blk1->yMin)
1728 before = gTrue;
1729 }
1730
1731 if (blk2->yMax <= blk1->yMin)
1732 before = gTrue;
1733 } else {
1734 if (blk2->isBeforeByRule1(blk1)) {
1735 // Rule (1) blk1 and blk2 overlap, and blk2 is above blk1.
1736 before = gTrue;
1737 #if 0 // for debugging
1738 printf("rule1: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
1739 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax,
1740 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1741 #endif
1742 } else if (blk2->isBeforeByRule2(blk1)) {
1743 // Rule (2) blk2 left of blk1, and no intervening blk3
1744 // such that blk1 is before blk3 by rule 1,
1745 // and blk3 is before blk2 by rule 1.
1746 before = gTrue;
1747 for (blk3 = blkList; blk3; blk3 = blk3->next) {
1748 if (blk3 == blk2 || blk3 == blk1) {
1749 continue;
1750 }
1751 if (blk1->isBeforeByRule1(blk3) &&
1752 blk3->isBeforeByRule1(blk2)) {
1753 before = gFalse;
1754 break;
1755 }
1756 }
1757 #if 0 // for debugging
1758 if (before) {
1759 printf("rule2: %.2f..%.2f %.2f..%.2f %.2f..%.2f %.2f..%.2f\n",
1760 blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax,
1761 blk2->ExMin, blk2->ExMax, blk2->EyMin, blk2->EyMax);
1762 }
1763 #endif
1764 }
1765 }
1766 if (before) {
1767 // blk2 is before blk1, so it needs to be visited
1768 // before we can add blk1 to the sorted list.
1769 sortPos = blk2->visitDepthFirst(blkList, pos2, sorted, sortPos, visited);
1770 }
1771 }
1772 #if 0 // for debugging
1773 printf("sorted: %d %.2f..%.2f %.2f..%.2f\n",
1774 sortPos, blk1->ExMin, blk1->ExMax, blk1->EyMin, blk1->EyMax);
1775 #endif
1776 sorted[sortPos++] = blk1;
1777 return sortPos;
1778 }
1779
1780 //------------------------------------------------------------------------
1781 // TextFlow
1782 //------------------------------------------------------------------------
1783
TextFlow(TextPage * pageA,TextBlock * blk)1784 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1785 page = pageA;
1786 xMin = blk->xMin;
1787 xMax = blk->xMax;
1788 yMin = blk->yMin;
1789 yMax = blk->yMax;
1790 priMin = blk->priMin;
1791 priMax = blk->priMax;
1792 blocks = lastBlk = blk;
1793 next = NULL;
1794 }
1795
~TextFlow()1796 TextFlow::~TextFlow() {
1797 TextBlock *blk;
1798
1799 while (blocks) {
1800 blk = blocks;
1801 blocks = blocks->next;
1802 delete blk;
1803 }
1804 }
1805
addBlock(TextBlock * blk)1806 void TextFlow::addBlock(TextBlock *blk) {
1807 if (lastBlk) {
1808 lastBlk->next = blk;
1809 } else {
1810 blocks = blk;
1811 }
1812 lastBlk = blk;
1813 if (blk->xMin < xMin) {
1814 xMin = blk->xMin;
1815 }
1816 if (blk->xMax > xMax) {
1817 xMax = blk->xMax;
1818 }
1819 if (blk->yMin < yMin) {
1820 yMin = blk->yMin;
1821 }
1822 if (blk->yMax > yMax) {
1823 yMax = blk->yMax;
1824 }
1825 }
1826
blockFits(TextBlock * blk,TextBlock * prevBlk)1827 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1828 GBool fits;
1829
1830 // lower blocks must use smaller fonts
1831 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1832 return gFalse;
1833 }
1834
1835 fits = gFalse; // make gcc happy
1836 switch (page->primaryRot) {
1837 case 0:
1838 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1839 break;
1840 case 1:
1841 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1842 break;
1843 case 2:
1844 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1845 break;
1846 case 3:
1847 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1848 break;
1849 }
1850 return fits;
1851 }
1852
1853 #if TEXTOUT_WORD_LIST
1854
1855 //------------------------------------------------------------------------
1856 // TextWordList
1857 //------------------------------------------------------------------------
1858
TextWordList(TextPage * text,GBool physLayout)1859 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1860 TextFlow *flow;
1861 TextBlock *blk;
1862 TextLine *line;
1863 TextWord *word;
1864 TextWord **wordArray;
1865 int nWords, i;
1866
1867 words = new GooList();
1868
1869 if (text->rawOrder) {
1870 for (word = text->rawWords; word; word = word->next) {
1871 words->append(word);
1872 }
1873
1874 } else if (physLayout) {
1875 // this is inefficient, but it's also the least useful of these
1876 // three cases
1877 nWords = 0;
1878 for (flow = text->flows; flow; flow = flow->next) {
1879 for (blk = flow->blocks; blk; blk = blk->next) {
1880 for (line = blk->lines; line; line = line->next) {
1881 for (word = line->words; word; word = word->next) {
1882 ++nWords;
1883 }
1884 }
1885 }
1886 }
1887 wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1888 i = 0;
1889 for (flow = text->flows; flow; flow = flow->next) {
1890 for (blk = flow->blocks; blk; blk = blk->next) {
1891 for (line = blk->lines; line; line = line->next) {
1892 for (word = line->words; word; word = word->next) {
1893 wordArray[i++] = word;
1894 }
1895 }
1896 }
1897 }
1898 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1899 for (i = 0; i < nWords; ++i) {
1900 words->append(wordArray[i]);
1901 }
1902 gfree(wordArray);
1903
1904 } else {
1905 for (flow = text->flows; flow; flow = flow->next) {
1906 for (blk = flow->blocks; blk; blk = blk->next) {
1907 for (line = blk->lines; line; line = line->next) {
1908 for (word = line->words; word; word = word->next) {
1909 words->append(word);
1910 }
1911 }
1912 }
1913 }
1914 }
1915 }
1916
~TextWordList()1917 TextWordList::~TextWordList() {
1918 delete words;
1919 }
1920
getLength()1921 int TextWordList::getLength() {
1922 return words->getLength();
1923 }
1924
get(int idx)1925 TextWord *TextWordList::get(int idx) {
1926 if (idx < 0 || idx >= words->getLength()) {
1927 return NULL;
1928 }
1929 return (TextWord *)words->get(idx);
1930 }
1931
1932 #endif // TEXTOUT_WORD_LIST
1933
1934 //------------------------------------------------------------------------
1935 // TextPage
1936 //------------------------------------------------------------------------
1937
TextPage(GBool rawOrderA)1938 TextPage::TextPage(GBool rawOrderA) {
1939 int rot;
1940
1941 refCnt = 1;
1942 rawOrder = rawOrderA;
1943 curWord = NULL;
1944 charPos = 0;
1945 curFont = NULL;
1946 curFontSize = 0;
1947 nest = 0;
1948 nTinyChars = 0;
1949 lastCharOverlap = gFalse;
1950 if (!rawOrder) {
1951 for (rot = 0; rot < 4; ++rot) {
1952 pools[rot] = new TextPool();
1953 }
1954 }
1955 flows = NULL;
1956 blocks = NULL;
1957 rawWords = NULL;
1958 rawLastWord = NULL;
1959 fonts = new GooList();
1960 lastFindXMin = lastFindYMin = 0;
1961 haveLastFind = gFalse;
1962 underlines = new GooList();
1963 links = new GooList();
1964 }
1965
~TextPage()1966 TextPage::~TextPage() {
1967 int rot;
1968
1969 clear();
1970 if (!rawOrder) {
1971 for (rot = 0; rot < 4; ++rot) {
1972 delete pools[rot];
1973 }
1974 }
1975 delete fonts;
1976 deleteGooList(underlines, TextUnderline);
1977 deleteGooList(links, TextLink);
1978 }
1979
incRefCnt()1980 void TextPage::incRefCnt() {
1981 refCnt++;
1982 }
1983
decRefCnt()1984 void TextPage::decRefCnt() {
1985 if (--refCnt == 0)
1986 delete this;
1987 }
1988
startPage(GfxState * state)1989 void TextPage::startPage(GfxState *state) {
1990 clear();
1991 if (state) {
1992 pageWidth = state->getPageWidth();
1993 pageHeight = state->getPageHeight();
1994 } else {
1995 pageWidth = pageHeight = 0;
1996 }
1997 }
1998
endPage()1999 void TextPage::endPage() {
2000 if (curWord) {
2001 endWord();
2002 }
2003 }
2004
clear()2005 void TextPage::clear() {
2006 int rot;
2007 TextFlow *flow;
2008 TextWord *word;
2009
2010 if (curWord) {
2011 delete curWord;
2012 curWord = NULL;
2013 }
2014 if (rawOrder) {
2015 while (rawWords) {
2016 word = rawWords;
2017 rawWords = rawWords->next;
2018 delete word;
2019 }
2020 } else {
2021 for (rot = 0; rot < 4; ++rot) {
2022 delete pools[rot];
2023 }
2024 while (flows) {
2025 flow = flows;
2026 flows = flows->next;
2027 delete flow;
2028 }
2029 gfree(blocks);
2030 }
2031 deleteGooList(fonts, TextFontInfo);
2032
2033 curWord = NULL;
2034 charPos = 0;
2035 curFont = NULL;
2036 curFontSize = 0;
2037 nest = 0;
2038 nTinyChars = 0;
2039 if (!rawOrder) {
2040 for (rot = 0; rot < 4; ++rot) {
2041 pools[rot] = new TextPool();
2042 }
2043 }
2044 flows = NULL;
2045 blocks = NULL;
2046 rawWords = NULL;
2047 rawLastWord = NULL;
2048 fonts = new GooList();
2049 }
2050
updateFont(GfxState * state)2051 void TextPage::updateFont(GfxState *state) {
2052 GfxFont *gfxFont;
2053 double *fm;
2054 char *name;
2055 int code, mCode, letterCode, anyCode;
2056 double w;
2057 int i;
2058
2059 // get the font info object
2060 curFont = NULL;
2061 for (i = 0; i < fonts->getLength(); ++i) {
2062 curFont = (TextFontInfo *)fonts->get(i);
2063 if (curFont->matches(state)) {
2064 break;
2065 }
2066 curFont = NULL;
2067 }
2068 if (!curFont) {
2069 curFont = new TextFontInfo(state);
2070 fonts->append(curFont);
2071 }
2072
2073 // adjust the font size
2074 gfxFont = state->getFont();
2075 curFontSize = state->getTransformedFontSize();
2076 if (gfxFont && gfxFont->getType() == fontType3) {
2077 // This is a hack which makes it possible to deal with some Type 3
2078 // fonts. The problem is that it's impossible to know what the
2079 // base coordinate system used in the font is without actually
2080 // rendering the font. This code tries to guess by looking at the
2081 // width of the character 'm' (which breaks if the font is a
2082 // subset that doesn't contain 'm').
2083 mCode = letterCode = anyCode = -1;
2084 for (code = 0; code < 256; ++code) {
2085 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
2086 if (name && name[0] == 'm' && name[1] == '\0') {
2087 mCode = code;
2088 }
2089 if (letterCode < 0 && name && name[1] == '\0' &&
2090 ((name[0] >= 'A' && name[0] <= 'Z') ||
2091 (name[0] >= 'a' && name[0] <= 'z'))) {
2092 letterCode = code;
2093 }
2094 if (anyCode < 0 && name &&
2095 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
2096 anyCode = code;
2097 }
2098 }
2099 if (mCode >= 0 &&
2100 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
2101 // 0.6 is a generic average 'm' width -- yes, this is a hack
2102 curFontSize *= w / 0.6;
2103 } else if (letterCode >= 0 &&
2104 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
2105 // even more of a hack: 0.5 is a generic letter width
2106 curFontSize *= w / 0.5;
2107 } else if (anyCode >= 0 &&
2108 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
2109 // better than nothing: 0.5 is a generic character width
2110 curFontSize *= w / 0.5;
2111 }
2112 fm = gfxFont->getFontMatrix();
2113 if (fm[0] != 0) {
2114 curFontSize *= fabs(fm[3] / fm[0]);
2115 }
2116 }
2117 }
2118
beginWord(GfxState * state,double x0,double y0)2119 void TextPage::beginWord(GfxState *state, double x0, double y0) {
2120 GfxFont *gfxFont;
2121 double *fontm;
2122 double m[4], m2[4];
2123 int rot;
2124
2125 // This check is needed because Type 3 characters can contain
2126 // text-drawing operations (when TextPage is being used via
2127 // {X,Win}SplashOutputDev rather than TextOutputDev).
2128 if (curWord) {
2129 ++nest;
2130 return;
2131 }
2132
2133 // compute the rotation
2134 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
2135 gfxFont = state->getFont();
2136 if (gfxFont && gfxFont->getType() == fontType3) {
2137 fontm = state->getFont()->getFontMatrix();
2138 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
2139 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
2140 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
2141 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
2142 m[0] = m2[0];
2143 m[1] = m2[1];
2144 m[2] = m2[2];
2145 m[3] = m2[3];
2146 }
2147 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
2148 rot = (m[3] < 0) ? 0 : 2;
2149 } else {
2150 rot = (m[2] > 0) ? 1 : 3;
2151 }
2152
2153 curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
2154 }
2155
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)2156 void TextPage::addChar(GfxState *state, double x, double y,
2157 double dx, double dy,
2158 CharCode c, int nBytes, Unicode *u, int uLen) {
2159 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
2160 GBool overlap;
2161 int i;
2162
2163 // subtract char and word spacing from the dx,dy values
2164 sp = state->getCharSpace();
2165 if (c == (CharCode)0x20) {
2166 sp += state->getWordSpace();
2167 }
2168 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
2169 dx -= dx2;
2170 dy -= dy2;
2171 state->transformDelta(dx, dy, &w1, &h1);
2172
2173 // throw away chars that aren't inside the page bounds
2174 // (and also do a sanity check on the character size)
2175 state->transform(x, y, &x1, &y1);
2176 if (x1 + w1 < 0 || x1 > pageWidth ||
2177 y1 + h1 < 0 || y1 > pageHeight ||
2178 w1 > pageWidth || h1 > pageHeight) {
2179 charPos += nBytes;
2180 return;
2181 }
2182
2183 // check the tiny chars limit
2184 if (!globalParams->getTextKeepTinyChars() &&
2185 fabs(w1) < 3 && fabs(h1) < 3) {
2186 if (++nTinyChars > 50000) {
2187 charPos += nBytes;
2188 return;
2189 }
2190 }
2191
2192 // break words at space character
2193 if (uLen == 1 && u[0] == (Unicode)0x20) {
2194 if (curWord) {
2195 ++curWord->charLen;
2196 }
2197 charPos += nBytes;
2198 endWord();
2199 return;
2200 }
2201
2202 // start a new word if:
2203 // (1) this character doesn't fall in the right place relative to
2204 // the end of the previous word (this places upper and lower
2205 // constraints on the position deltas along both the primary
2206 // and secondary axes), or
2207 // (2) this character overlaps the previous one (duplicated text), or
2208 // (3) the previous character was an overlap (we want each duplicated
2209 // character to be in a word by itself at this stage),
2210 // (4) the font size has changed
2211 if (curWord && curWord->len > 0) {
2212 base = sp = delta = 0; // make gcc happy
2213 switch (curWord->rot) {
2214 case 0:
2215 base = y1;
2216 sp = x1 - curWord->xMax;
2217 delta = x1 - curWord->edge[curWord->len - 1];
2218 break;
2219 case 1:
2220 base = x1;
2221 sp = y1 - curWord->yMax;
2222 delta = y1 - curWord->edge[curWord->len - 1];
2223 break;
2224 case 2:
2225 base = y1;
2226 sp = curWord->xMin - x1;
2227 delta = curWord->edge[curWord->len - 1] - x1;
2228 break;
2229 case 3:
2230 base = x1;
2231 sp = curWord->yMin - y1;
2232 delta = curWord->edge[curWord->len - 1] - y1;
2233 break;
2234 }
2235 overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
2236 fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
2237 if (overlap || lastCharOverlap ||
2238 sp < -minDupBreakOverlap * curWord->fontSize ||
2239 sp > minWordBreakSpace * curWord->fontSize ||
2240 fabs(base - curWord->base) > 0.5 ||
2241 curFontSize != curWord->fontSize) {
2242 endWord();
2243 }
2244 lastCharOverlap = overlap;
2245 } else {
2246 lastCharOverlap = gFalse;
2247 }
2248
2249 if (uLen != 0) {
2250 // start a new word if needed
2251 if (!curWord) {
2252 beginWord(state, x, y);
2253 }
2254
2255 // page rotation and/or transform matrices can cause text to be
2256 // drawn in reverse order -- in this case, swap the begin/end
2257 // coordinates and break text into individual chars
2258 if ((curWord->rot == 0 && w1 < 0) ||
2259 (curWord->rot == 1 && h1 < 0) ||
2260 (curWord->rot == 2 && w1 > 0) ||
2261 (curWord->rot == 3 && h1 > 0)) {
2262 endWord();
2263 beginWord(state, x + dx, y + dy);
2264 x1 += w1;
2265 y1 += h1;
2266 w1 = -w1;
2267 h1 = -h1;
2268 }
2269
2270 // add the characters to the current word
2271 w1 /= uLen;
2272 h1 /= uLen;
2273 for (i = 0; i < uLen; ++i) {
2274 if (u[i] >= 0xd800 && u[i] < 0xdc00) { /* surrogate pair */
2275 if (i + 1 < uLen && u[i+1] >= 0xdc00 && u[i+1] < 0xe000) {
2276 /* next code is a low surrogate */
2277 Unicode uu = (((u[i] & 0x3ff) << 10) | (u[i+1] & 0x3ff)) + 0x10000;
2278 i++;
2279 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, uu);
2280 } else {
2281 /* missing low surrogate
2282 replace it with REPLACEMENT CHARACTER (U+FFFD) */
2283 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
2284 }
2285 } else if (u[i] >= 0xdc00 && u[i] < 0xe000) {
2286 /* invalid low surrogate
2287 replace it with REPLACEMENT CHARACTER (U+FFFD) */
2288 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, 0xfffd);
2289 } else {
2290 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, c, u[i]);
2291 }
2292 }
2293 }
2294 if (curWord) {
2295 curWord->charLen += nBytes;
2296 }
2297 charPos += nBytes;
2298 }
2299
endWord()2300 void TextPage::endWord() {
2301 // This check is needed because Type 3 characters can contain
2302 // text-drawing operations (when TextPage is being used via
2303 // {X,Win}SplashOutputDev rather than TextOutputDev).
2304 if (nest > 0) {
2305 --nest;
2306 return;
2307 }
2308
2309 if (curWord) {
2310 addWord(curWord);
2311 curWord = NULL;
2312 }
2313 }
2314
addWord(TextWord * word)2315 void TextPage::addWord(TextWord *word) {
2316 // throw away zero-length words -- they don't have valid xMin/xMax
2317 // values, and they're useless anyway
2318 if (word->len == 0) {
2319 delete word;
2320 return;
2321 }
2322
2323 if (rawOrder) {
2324 if (rawLastWord) {
2325 rawLastWord->next = word;
2326 } else {
2327 rawWords = word;
2328 }
2329 rawLastWord = word;
2330 } else {
2331 pools[word->rot]->addWord(word);
2332 }
2333 }
2334
addUnderline(double x0,double y0,double x1,double y1)2335 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
2336 underlines->append(new TextUnderline(x0, y0, x1, y1));
2337 }
2338
addLink(int xMin,int yMin,int xMax,int yMax,Link * link)2339 void TextPage::addLink(int xMin, int yMin, int xMax, int yMax, Link *link) {
2340 links->append(new TextLink(xMin, yMin, xMax, yMax, link));
2341 }
2342
coalesce(GBool physLayout,GBool doHTML)2343 void TextPage::coalesce(GBool physLayout, GBool doHTML) {
2344 UnicodeMap *uMap;
2345 TextPool *pool;
2346 TextWord *word0, *word1, *word2;
2347 TextLine *line;
2348 TextBlock *blkList, *blk, *lastBlk, *blk0, *blk1, *blk2;
2349 TextFlow *flow, *lastFlow;
2350 TextUnderline *underline;
2351 TextLink *link;
2352 int rot, poolMinBaseIdx, baseIdx, startBaseIdx, endBaseIdx;
2353 double minBase, maxBase, newMinBase, newMaxBase;
2354 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
2355 GBool found;
2356 int count[4];
2357 int lrCount;
2358 int col1, col2;
2359 int i, j, n;
2360
2361 if (rawOrder) {
2362 primaryRot = 0;
2363 primaryLR = gTrue;
2364 return;
2365 }
2366
2367 uMap = globalParams->getTextEncoding();
2368 blkList = NULL;
2369 lastBlk = NULL;
2370 nBlocks = 0;
2371 primaryRot = -1;
2372
2373 #if 0 // for debugging
2374 printf("*** initial words ***\n");
2375 for (rot = 0; rot < 4; ++rot) {
2376 pool = pools[rot];
2377 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
2378 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
2379 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d link=%p '",
2380 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2381 word0->base, word0->fontSize, rot*90, word0->link);
2382 for (i = 0; i < word0->len; ++i) {
2383 fputc(word0->text[i] & 0xff, stdout);
2384 }
2385 printf("'\n");
2386 }
2387 }
2388 }
2389 printf("\n");
2390 #endif
2391
2392 #if 0 //~ for debugging
2393 for (i = 0; i < underlines->getLength(); ++i) {
2394 underline = (TextUnderline *)underlines->get(i);
2395 printf("underline: x=%g..%g y=%g..%g horiz=%d\n",
2396 underline->x0, underline->x1, underline->y0, underline->y1,
2397 underline->horiz);
2398 }
2399 #endif
2400
2401 if (doHTML) {
2402
2403 //----- handle underlining
2404 for (i = 0; i < underlines->getLength(); ++i) {
2405 underline = (TextUnderline *)underlines->get(i);
2406 if (underline->horiz) {
2407 // rot = 0
2408 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2409 startBaseIdx = pools[0]->getBaseIdx(underline->y0 + minUnderlineGap);
2410 endBaseIdx = pools[0]->getBaseIdx(underline->y0 + maxUnderlineGap);
2411 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2412 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2413 //~ need to check the y value against the word baseline
2414 if (underline->x0 < word0->xMin + underlineSlack &&
2415 word0->xMax - underlineSlack < underline->x1) {
2416 word0->underlined = gTrue;
2417 }
2418 }
2419 }
2420 }
2421
2422 // rot = 2
2423 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2424 startBaseIdx = pools[2]->getBaseIdx(underline->y0 - maxUnderlineGap);
2425 endBaseIdx = pools[2]->getBaseIdx(underline->y0 - minUnderlineGap);
2426 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2427 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2428 if (underline->x0 < word0->xMin + underlineSlack &&
2429 word0->xMax - underlineSlack < underline->x1) {
2430 word0->underlined = gTrue;
2431 }
2432 }
2433 }
2434 }
2435 } else {
2436 // rot = 1
2437 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2438 startBaseIdx = pools[1]->getBaseIdx(underline->x0 - maxUnderlineGap);
2439 endBaseIdx = pools[1]->getBaseIdx(underline->x0 - minUnderlineGap);
2440 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2441 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2442 if (underline->y0 < word0->yMin + underlineSlack &&
2443 word0->yMax - underlineSlack < underline->y1) {
2444 word0->underlined = gTrue;
2445 }
2446 }
2447 }
2448 }
2449
2450 // rot = 3
2451 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2452 startBaseIdx = pools[3]->getBaseIdx(underline->x0 + minUnderlineGap);
2453 endBaseIdx = pools[3]->getBaseIdx(underline->x0 + maxUnderlineGap);
2454 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2455 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2456 if (underline->y0 < word0->yMin + underlineSlack &&
2457 word0->yMax - underlineSlack < underline->y1) {
2458 word0->underlined = gTrue;
2459 }
2460 }
2461 }
2462 }
2463 }
2464 }
2465
2466 //----- handle links
2467 for (i = 0; i < links->getLength(); ++i) {
2468 link = (TextLink *)links->get(i);
2469
2470 // rot = 0
2471 if (pools[0]->minBaseIdx <= pools[0]->maxBaseIdx) {
2472 startBaseIdx = pools[0]->getBaseIdx(link->yMin);
2473 endBaseIdx = pools[0]->getBaseIdx(link->yMax);
2474 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2475 for (word0 = pools[0]->getPool(j); word0; word0 = word0->next) {
2476 if (link->xMin < word0->xMin + hyperlinkSlack &&
2477 word0->xMax - hyperlinkSlack < link->xMax &&
2478 link->yMin < word0->yMin + hyperlinkSlack &&
2479 word0->yMax - hyperlinkSlack < link->yMax) {
2480 word0->link = link->link;
2481 }
2482 }
2483 }
2484 }
2485
2486 // rot = 2
2487 if (pools[2]->minBaseIdx <= pools[2]->maxBaseIdx) {
2488 startBaseIdx = pools[2]->getBaseIdx(link->yMin);
2489 endBaseIdx = pools[2]->getBaseIdx(link->yMax);
2490 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2491 for (word0 = pools[2]->getPool(j); word0; word0 = word0->next) {
2492 if (link->xMin < word0->xMin + hyperlinkSlack &&
2493 word0->xMax - hyperlinkSlack < link->xMax &&
2494 link->yMin < word0->yMin + hyperlinkSlack &&
2495 word0->yMax - hyperlinkSlack < link->yMax) {
2496 word0->link = link->link;
2497 }
2498 }
2499 }
2500 }
2501
2502 // rot = 1
2503 if (pools[1]->minBaseIdx <= pools[1]->maxBaseIdx) {
2504 startBaseIdx = pools[1]->getBaseIdx(link->xMin);
2505 endBaseIdx = pools[1]->getBaseIdx(link->xMax);
2506 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2507 for (word0 = pools[1]->getPool(j); word0; word0 = word0->next) {
2508 if (link->yMin < word0->yMin + hyperlinkSlack &&
2509 word0->yMax - hyperlinkSlack < link->yMax &&
2510 link->xMin < word0->xMin + hyperlinkSlack &&
2511 word0->xMax - hyperlinkSlack < link->xMax) {
2512 word0->link = link->link;
2513 }
2514 }
2515 }
2516 }
2517
2518 // rot = 3
2519 if (pools[3]->minBaseIdx <= pools[3]->maxBaseIdx) {
2520 startBaseIdx = pools[3]->getBaseIdx(link->xMin);
2521 endBaseIdx = pools[3]->getBaseIdx(link->xMax);
2522 for (j = startBaseIdx; j <= endBaseIdx; ++j) {
2523 for (word0 = pools[3]->getPool(j); word0; word0 = word0->next) {
2524 if (link->yMin < word0->yMin + hyperlinkSlack &&
2525 word0->yMax - hyperlinkSlack < link->yMax &&
2526 link->xMin < word0->xMin + hyperlinkSlack &&
2527 word0->xMax - hyperlinkSlack < link->xMax) {
2528 word0->link = link->link;
2529 }
2530 }
2531 }
2532 }
2533 }
2534 }
2535
2536 //----- assemble the blocks
2537
2538 //~ add an outer loop for writing mode (vertical text)
2539
2540 // build blocks for each rotation value
2541 for (rot = 0; rot < 4; ++rot) {
2542 pool = pools[rot];
2543 poolMinBaseIdx = pool->minBaseIdx;
2544 count[rot] = 0;
2545
2546 // add blocks until no more words are left
2547 while (1) {
2548
2549 // find the first non-empty line in the pool
2550 for (;
2551 poolMinBaseIdx <= pool->maxBaseIdx &&
2552 !pool->getPool(poolMinBaseIdx);
2553 ++poolMinBaseIdx) ;
2554 if (poolMinBaseIdx > pool->maxBaseIdx) {
2555 break;
2556 }
2557
2558 // look for the left-most word in the first four lines of the
2559 // pool -- this avoids starting with a superscript word
2560 startBaseIdx = poolMinBaseIdx;
2561 for (baseIdx = poolMinBaseIdx + 1;
2562 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2563 ++baseIdx) {
2564 if (!pool->getPool(baseIdx)) {
2565 continue;
2566 }
2567 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2568 < 0) {
2569 startBaseIdx = baseIdx;
2570 }
2571 }
2572
2573 // create a new block
2574 word0 = pool->getPool(startBaseIdx);
2575 pool->setPool(startBaseIdx, word0->next);
2576 word0->next = NULL;
2577 blk = new TextBlock(this, rot);
2578 blk->addWord(word0);
2579
2580 fontSize = word0->fontSize;
2581 minBase = maxBase = word0->base;
2582 colSpace1 = minColSpacing1 * fontSize;
2583 colSpace2 = minColSpacing2 * fontSize;
2584 lineSpace = maxLineSpacingDelta * fontSize;
2585 intraLineSpace = maxIntraLineDelta * fontSize;
2586
2587 // add words to the block
2588 do {
2589 found = gFalse;
2590
2591 // look for words on the line above the current top edge of
2592 // the block
2593 newMinBase = minBase;
2594 for (baseIdx = pool->getBaseIdx(minBase);
2595 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2596 --baseIdx) {
2597 word0 = NULL;
2598 word1 = pool->getPool(baseIdx);
2599 while (word1) {
2600 if (word1->base < minBase &&
2601 word1->base >= minBase - lineSpace &&
2602 ((rot == 0 || rot == 2)
2603 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2604 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2605 fabs(word1->fontSize - fontSize) <
2606 maxBlockFontSizeDelta1 * fontSize) {
2607 word2 = word1;
2608 if (word0) {
2609 word0->next = word1->next;
2610 } else {
2611 pool->setPool(baseIdx, word1->next);
2612 }
2613 word1 = word1->next;
2614 word2->next = NULL;
2615 blk->addWord(word2);
2616 found = gTrue;
2617 newMinBase = word2->base;
2618 } else {
2619 word0 = word1;
2620 word1 = word1->next;
2621 }
2622 }
2623 }
2624 minBase = newMinBase;
2625
2626 // look for words on the line below the current bottom edge of
2627 // the block
2628 newMaxBase = maxBase;
2629 for (baseIdx = pool->getBaseIdx(maxBase);
2630 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2631 ++baseIdx) {
2632 word0 = NULL;
2633 word1 = pool->getPool(baseIdx);
2634 while (word1) {
2635 if (word1->base > maxBase &&
2636 word1->base <= maxBase + lineSpace &&
2637 ((rot == 0 || rot == 2)
2638 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2639 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2640 fabs(word1->fontSize - fontSize) <
2641 maxBlockFontSizeDelta1 * fontSize) {
2642 word2 = word1;
2643 if (word0) {
2644 word0->next = word1->next;
2645 } else {
2646 pool->setPool(baseIdx, word1->next);
2647 }
2648 word1 = word1->next;
2649 word2->next = NULL;
2650 blk->addWord(word2);
2651 found = gTrue;
2652 newMaxBase = word2->base;
2653 } else {
2654 word0 = word1;
2655 word1 = word1->next;
2656 }
2657 }
2658 }
2659 maxBase = newMaxBase;
2660
2661 // look for words that are on lines already in the block, and
2662 // that overlap the block horizontally
2663 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2664 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2665 ++baseIdx) {
2666 word0 = NULL;
2667 word1 = pool->getPool(baseIdx);
2668 while (word1) {
2669 if (word1->base >= minBase - intraLineSpace &&
2670 word1->base <= maxBase + intraLineSpace &&
2671 ((rot == 0 || rot == 2)
2672 ? (word1->xMin < blk->xMax + colSpace1 &&
2673 word1->xMax > blk->xMin - colSpace1)
2674 : (word1->yMin < blk->yMax + colSpace1 &&
2675 word1->yMax > blk->yMin - colSpace1)) &&
2676 fabs(word1->fontSize - fontSize) <
2677 maxBlockFontSizeDelta2 * fontSize) {
2678 word2 = word1;
2679 if (word0) {
2680 word0->next = word1->next;
2681 } else {
2682 pool->setPool(baseIdx, word1->next);
2683 }
2684 word1 = word1->next;
2685 word2->next = NULL;
2686 blk->addWord(word2);
2687 found = gTrue;
2688 } else {
2689 word0 = word1;
2690 word1 = word1->next;
2691 }
2692 }
2693 }
2694
2695 // only check for outlying words (the next two chunks of code)
2696 // if we didn't find anything else
2697 if (found) {
2698 continue;
2699 }
2700
2701 // scan down the left side of the block, looking for words
2702 // that are near (but not overlapping) the block; if there are
2703 // three or fewer, add them to the block
2704 n = 0;
2705 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2706 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2707 ++baseIdx) {
2708 word1 = pool->getPool(baseIdx);
2709 while (word1) {
2710 if (word1->base >= minBase - intraLineSpace &&
2711 word1->base <= maxBase + intraLineSpace &&
2712 ((rot == 0 || rot == 2)
2713 ? (word1->xMax <= blk->xMin &&
2714 word1->xMax > blk->xMin - colSpace2)
2715 : (word1->yMax <= blk->yMin &&
2716 word1->yMax > blk->yMin - colSpace2)) &&
2717 fabs(word1->fontSize - fontSize) <
2718 maxBlockFontSizeDelta3 * fontSize) {
2719 ++n;
2720 break;
2721 }
2722 word1 = word1->next;
2723 }
2724 }
2725 if (n > 0 && n <= 3) {
2726 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2727 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2728 ++baseIdx) {
2729 word0 = NULL;
2730 word1 = pool->getPool(baseIdx);
2731 while (word1) {
2732 if (word1->base >= minBase - intraLineSpace &&
2733 word1->base <= maxBase + intraLineSpace &&
2734 ((rot == 0 || rot == 2)
2735 ? (word1->xMax <= blk->xMin &&
2736 word1->xMax > blk->xMin - colSpace2)
2737 : (word1->yMax <= blk->yMin &&
2738 word1->yMax > blk->yMin - colSpace2)) &&
2739 fabs(word1->fontSize - fontSize) <
2740 maxBlockFontSizeDelta3 * fontSize) {
2741 word2 = word1;
2742 if (word0) {
2743 word0->next = word1->next;
2744 } else {
2745 pool->setPool(baseIdx, word1->next);
2746 }
2747 word1 = word1->next;
2748 word2->next = NULL;
2749 blk->addWord(word2);
2750 if (word2->base < minBase) {
2751 minBase = word2->base;
2752 } else if (word2->base > maxBase) {
2753 maxBase = word2->base;
2754 }
2755 found = gTrue;
2756 break;
2757 } else {
2758 word0 = word1;
2759 word1 = word1->next;
2760 }
2761 }
2762 }
2763 }
2764
2765 // scan down the right side of the block, looking for words
2766 // that are near (but not overlapping) the block; if there are
2767 // three or fewer, add them to the block
2768 n = 0;
2769 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2770 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2771 ++baseIdx) {
2772 word1 = pool->getPool(baseIdx);
2773 while (word1) {
2774 if (word1->base >= minBase - intraLineSpace &&
2775 word1->base <= maxBase + intraLineSpace &&
2776 ((rot == 0 || rot == 2)
2777 ? (word1->xMin >= blk->xMax &&
2778 word1->xMin < blk->xMax + colSpace2)
2779 : (word1->yMin >= blk->yMax &&
2780 word1->yMin < blk->yMax + colSpace2)) &&
2781 fabs(word1->fontSize - fontSize) <
2782 maxBlockFontSizeDelta3 * fontSize) {
2783 ++n;
2784 break;
2785 }
2786 word1 = word1->next;
2787 }
2788 }
2789 if (n > 0 && n <= 3) {
2790 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2791 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2792 ++baseIdx) {
2793 word0 = NULL;
2794 word1 = pool->getPool(baseIdx);
2795 while (word1) {
2796 if (word1->base >= minBase - intraLineSpace &&
2797 word1->base <= maxBase + intraLineSpace &&
2798 ((rot == 0 || rot == 2)
2799 ? (word1->xMin >= blk->xMax &&
2800 word1->xMin < blk->xMax + colSpace2)
2801 : (word1->yMin >= blk->yMax &&
2802 word1->yMin < blk->yMax + colSpace2)) &&
2803 fabs(word1->fontSize - fontSize) <
2804 maxBlockFontSizeDelta3 * fontSize) {
2805 word2 = word1;
2806 if (word0) {
2807 word0->next = word1->next;
2808 } else {
2809 pool->setPool(baseIdx, word1->next);
2810 }
2811 word1 = word1->next;
2812 word2->next = NULL;
2813 blk->addWord(word2);
2814 if (word2->base < minBase) {
2815 minBase = word2->base;
2816 } else if (word2->base > maxBase) {
2817 maxBase = word2->base;
2818 }
2819 found = gTrue;
2820 break;
2821 } else {
2822 word0 = word1;
2823 word1 = word1->next;
2824 }
2825 }
2826 }
2827 }
2828
2829 } while (found);
2830
2831 //~ need to compute the primary writing mode (horiz/vert) in
2832 //~ addition to primary rotation
2833
2834 // coalesce the block, and add it to the list
2835 blk->coalesce(uMap);
2836 if (lastBlk) {
2837 lastBlk->next = blk;
2838 } else {
2839 blkList = blk;
2840 }
2841 lastBlk = blk;
2842 count[rot] += blk->charCount;
2843 if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2844 primaryRot = rot;
2845 }
2846 ++nBlocks;
2847 }
2848 }
2849
2850 #if 0 // for debugging
2851 printf("*** rotation ***\n");
2852 for (rot = 0; rot < 4; ++rot) {
2853 printf(" %d: %6d\n", rot, count[rot]);
2854 }
2855 printf(" primary rot = %d\n", primaryRot);
2856 printf("\n");
2857 #endif
2858
2859 #if 0 // for debugging
2860 printf("*** blocks ***\n");
2861 for (blk = blkList; blk; blk = blk->next) {
2862 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2863 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2864 for (line = blk->lines; line; line = line->next) {
2865 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2866 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2867 for (word0 = line->words; word0; word0 = word0->next) {
2868 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2869 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2870 word0->base, word0->fontSize, word0->spaceAfter);
2871 for (i = 0; i < word0->len; ++i) {
2872 fputc(word0->text[i] & 0xff, stdout);
2873 }
2874 printf("'\n");
2875 }
2876 }
2877 }
2878 printf("\n");
2879 #endif
2880
2881 // determine the primary direction
2882 lrCount = 0;
2883 for (blk = blkList; blk; blk = blk->next) {
2884 for (line = blk->lines; line; line = line->next) {
2885 for (word0 = line->words; word0; word0 = word0->next) {
2886 for (i = 0; i < word0->len; ++i) {
2887 if (unicodeTypeL(word0->text[i])) {
2888 ++lrCount;
2889 } else if (unicodeTypeR(word0->text[i])) {
2890 --lrCount;
2891 }
2892 }
2893 }
2894 }
2895 }
2896 primaryLR = lrCount >= 0;
2897
2898 #if 0 // for debugging
2899 printf("*** direction ***\n");
2900 printf("lrCount = %d\n", lrCount);
2901 printf("primaryLR = %d\n", primaryLR);
2902 #endif
2903
2904 //----- column assignment
2905
2906 // sort blocks into xy order for column assignment
2907 if (blocks)
2908 gfree (blocks);
2909 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2910 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2911 blocks[i] = blk;
2912 }
2913 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2914
2915 // column assignment
2916 for (i = 0; i < nBlocks; ++i) {
2917 blk0 = blocks[i];
2918 col1 = 0;
2919 for (j = 0; j < i; ++j) {
2920 blk1 = blocks[j];
2921 col2 = 0; // make gcc happy
2922 switch (primaryRot) {
2923 case 0:
2924 if (blk0->xMin > blk1->xMax) {
2925 col2 = blk1->col + blk1->nColumns + 3;
2926 } else if (blk1->xMax == blk1->xMin) {
2927 col2 = blk1->col;
2928 } else {
2929 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2930 (blk1->xMax - blk1->xMin)) *
2931 blk1->nColumns);
2932 }
2933 break;
2934 case 1:
2935 if (blk0->yMin > blk1->yMax) {
2936 col2 = blk1->col + blk1->nColumns + 3;
2937 } else if (blk1->yMax == blk1->yMin) {
2938 col2 = blk1->col;
2939 } else {
2940 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2941 (blk1->yMax - blk1->yMin)) *
2942 blk1->nColumns);
2943 }
2944 break;
2945 case 2:
2946 if (blk0->xMax < blk1->xMin) {
2947 col2 = blk1->col + blk1->nColumns + 3;
2948 } else if (blk1->xMin == blk1->xMax) {
2949 col2 = blk1->col;
2950 } else {
2951 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2952 (blk1->xMin - blk1->xMax)) *
2953 blk1->nColumns);
2954 }
2955 break;
2956 case 3:
2957 if (blk0->yMax < blk1->yMin) {
2958 col2 = blk1->col + blk1->nColumns + 3;
2959 } else if (blk1->yMin == blk1->yMax) {
2960 col2 = blk1->col;
2961 } else {
2962 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2963 (blk1->yMin - blk1->yMax)) *
2964 blk1->nColumns);
2965 }
2966 break;
2967 }
2968 if (col2 > col1) {
2969 col1 = col2;
2970 }
2971 }
2972 blk0->col = col1;
2973 for (line = blk0->lines; line; line = line->next) {
2974 for (j = 0; j <= line->len; ++j) {
2975 line->col[j] += col1;
2976 }
2977 }
2978 }
2979
2980 #if 0 // for debugging
2981 printf("*** blocks, after column assignment ***\n");
2982 for (blk = blkList; blk; blk = blk->next) {
2983 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2984 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2985 blk->nColumns);
2986 for (line = blk->lines; line; line = line->next) {
2987 printf(" line:\n");
2988 for (word0 = line->words; word0; word0 = word0->next) {
2989 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2990 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2991 word0->base, word0->fontSize, word0->spaceAfter);
2992 for (i = 0; i < word0->len; ++i) {
2993 fputc(word0->text[i] & 0xff, stdout);
2994 }
2995 printf("'\n");
2996 }
2997 }
2998 }
2999 printf("\n");
3000 #endif
3001
3002 //----- reading order sort
3003
3004 // compute space on left and right sides of each block
3005 for (i = 0; i < nBlocks; ++i) {
3006 blk0 = blocks[i];
3007 for (j = 0; j < nBlocks; ++j) {
3008 blk1 = blocks[j];
3009 if (blk1 != blk0) {
3010 blk0->updatePriMinMax(blk1);
3011 }
3012 }
3013 }
3014
3015 #if 0 // for debugging
3016 printf("PAGE\n");
3017 #endif
3018
3019 int sortPos = 0;
3020 GBool *visited = (GBool *)gmallocn(nBlocks, sizeof(GBool));
3021 for (i = 0; i < nBlocks; i++) {
3022 visited[i] = gFalse;
3023 }
3024
3025 double bxMin0, byMin0, bxMin1, byMin1;
3026 int numTables = 0;
3027 int tableId = -1;
3028 int correspondenceX, correspondenceY;
3029 double xCentre1, yCentre1, xCentre2, yCentre2;
3030 double xCentre3, yCentre3, xCentre4, yCentre4;
3031 double deltaX, deltaY;
3032 TextBlock *fblk2 = NULL, *fblk3 = NULL, *fblk4 = NULL;
3033
3034 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3035 blk1->ExMin = blk1->xMin;
3036 blk1->ExMax = blk1->xMax;
3037 blk1->EyMin = blk1->yMin;
3038 blk1->EyMax = blk1->yMax;
3039
3040 bxMin0 = DBL_MAX;
3041 byMin0 = DBL_MAX;
3042 bxMin1 = DBL_MAX;
3043 byMin1 = DBL_MAX;
3044
3045 fblk2 = NULL;
3046 fblk3 = NULL;
3047 fblk4 = NULL;
3048
3049 /* find fblk2, fblk3 and fblk4 so that
3050 * fblk2 is on the right of blk1 and overlap with blk1 in y axis
3051 * fblk3 is under blk1 and overlap with blk1 in x axis
3052 * fblk4 is under blk1 and on the right of blk1
3053 * and they are closest to blk1
3054 */
3055 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3056 if (blk2 != blk1) {
3057 if (blk2->yMin <= blk1->yMax &&
3058 blk2->yMax >= blk1->yMin &&
3059 blk2->xMin > blk1->xMax &&
3060 blk2->xMin < bxMin0) {
3061 bxMin0 = blk2->xMin;
3062 fblk2 = blk2;
3063 } else if (blk2->xMin <= blk1->xMax &&
3064 blk2->xMax >= blk1->xMin &&
3065 blk2->yMin > blk1->yMax &&
3066 blk2->yMin < byMin0) {
3067 byMin0 = blk2->yMin;
3068 fblk3 = blk2;
3069 } else if (blk2->xMin > blk1->xMax &&
3070 blk2->xMin < bxMin1 &&
3071 blk2->yMin > blk1->yMax &&
3072 blk2->yMin < byMin1) {
3073 bxMin1 = blk2->xMin;
3074 byMin1 = blk2->yMin;
3075 fblk4 = blk2;
3076 }
3077 }
3078 }
3079
3080 /* fblk4 can not overlap with fblk3 in x and with fblk2 in y
3081 * fblk2 can not overlap with fblk3 in x and y
3082 * fblk4 has to overlap with fblk3 in y and with fblk2 in x
3083 */
3084 if (fblk2 != NULL &&
3085 fblk3 != NULL &&
3086 fblk4 != NULL) {
3087 if (((fblk3->xMin <= fblk4->xMax && fblk3->xMax >= fblk4->xMin) ||
3088 (fblk2->yMin <= fblk4->yMax && fblk2->yMax >= fblk4->yMin) ||
3089 (fblk2->xMin <= fblk3->xMax && fblk2->xMax >= fblk3->xMin) ||
3090 (fblk2->yMin <= fblk3->yMax && fblk2->yMax >= fblk3->yMin)) ||
3091 !(fblk4->xMin <= fblk2->xMax && fblk4->xMax >= fblk2->xMin &&
3092 fblk4->yMin <= fblk3->yMax && fblk4->yMax >= fblk3->yMin)) {
3093 fblk2 = NULL;
3094 fblk3 = NULL;
3095 fblk4 = NULL;
3096 }
3097 }
3098
3099 // if we found any then look whether they form a table
3100 if (fblk2 != NULL &&
3101 fblk3 != NULL &&
3102 fblk4 != NULL) {
3103 tableId = -1;
3104 correspondenceX = 0;
3105 correspondenceY = 0;
3106 deltaX = 0.0;
3107 deltaY = 0.0;
3108
3109 if (blk1->lines && blk1->lines->words)
3110 deltaX = blk1->lines->words->getFontSize();
3111 if (fblk2->lines && fblk2->lines->words)
3112 deltaX = deltaX < fblk2->lines->words->getFontSize() ?
3113 deltaX : fblk2->lines->words->getFontSize();
3114 if (fblk3->lines && fblk3->lines->words)
3115 deltaX = deltaX < fblk3->lines->words->getFontSize() ?
3116 deltaX : fblk3->lines->words->getFontSize();
3117 if (fblk4->lines && fblk4->lines->words)
3118 deltaX = deltaX < fblk4->lines->words->getFontSize() ?
3119 deltaX : fblk4->lines->words->getFontSize();
3120
3121 deltaY = deltaX;
3122
3123 deltaX *= minColSpacing1;
3124 deltaY *= maxIntraLineDelta;
3125
3126 xCentre1 = (blk1->xMax + blk1->xMin) / 2.0;
3127 yCentre1 = (blk1->yMax + blk1->yMin) / 2.0;
3128 xCentre2 = (fblk2->xMax + fblk2->xMin) / 2.0;
3129 yCentre2 = (fblk2->yMax + fblk2->yMin) / 2.0;
3130 xCentre3 = (fblk3->xMax + fblk3->xMin) / 2.0;
3131 yCentre3 = (fblk3->yMax + fblk3->yMin) / 2.0;
3132 xCentre4 = (fblk4->xMax + fblk4->xMin) / 2.0;
3133 yCentre4 = (fblk4->yMax + fblk4->yMin) / 2.0;
3134
3135 // are blocks centrally aligned in x ?
3136 if (fabs (xCentre1 - xCentre3) <= deltaX &&
3137 fabs (xCentre2 - xCentre4) <= deltaX)
3138 correspondenceX++;
3139
3140 // are blocks centrally aligned in y ?
3141 if (fabs (yCentre1 - yCentre2) <= deltaY &&
3142 fabs (yCentre3 - yCentre4) <= deltaY)
3143 correspondenceY++;
3144
3145 // are blocks aligned to the left ?
3146 if (fabs (blk1->xMin - fblk3->xMin) <= deltaX &&
3147 fabs (fblk2->xMin - fblk4->xMin) <= deltaX)
3148 correspondenceX++;
3149
3150 // are blocks aligned to the right ?
3151 if (fabs (blk1->xMax - fblk3->xMax) <= deltaX &&
3152 fabs (fblk2->xMax - fblk4->xMax) <= deltaX)
3153 correspondenceX++;
3154
3155 // are blocks aligned to the top ?
3156 if (fabs (blk1->yMin - fblk2->yMin) <= deltaY &&
3157 fabs (fblk3->yMin - fblk4->yMin) <= deltaY)
3158 correspondenceY++;
3159
3160 // are blocks aligned to the bottom ?
3161 if (fabs (blk1->yMax - fblk2->yMax) <= deltaY &&
3162 fabs (fblk3->yMax - fblk4->yMax) <= deltaY)
3163 correspondenceY++;
3164
3165 // are blocks aligned in x and y ?
3166 if (correspondenceX > 0 &&
3167 correspondenceY > 0) {
3168
3169 // find maximal tableId
3170 tableId = tableId < fblk4->tableId ? fblk4->tableId : tableId;
3171 tableId = tableId < fblk3->tableId ? fblk3->tableId : tableId;
3172 tableId = tableId < fblk2->tableId ? fblk2->tableId : tableId;
3173 tableId = tableId < blk1->tableId ? blk1->tableId : tableId;
3174
3175 // if the tableId is -1, then we found new table
3176 if (tableId < 0) {
3177 tableId = numTables;
3178 numTables++;
3179 }
3180
3181 blk1->tableId = tableId;
3182 fblk2->tableId = tableId;
3183 fblk3->tableId = tableId;
3184 fblk4->tableId = tableId;
3185 }
3186 }
3187 }
3188
3189 /* set extended bounding boxes of all table entries
3190 * so that they contain whole table
3191 * (we need to process whole table size when comparing it
3192 * with regular text blocks)
3193 */
3194 PDFRectangle *envelopes = new PDFRectangle [numTables];
3195 TextBlock **ending_blocks = new TextBlock* [numTables];
3196
3197 for (i = 0; i < numTables; i++) {
3198 envelopes[i].x1 = DBL_MAX;
3199 envelopes[i].x2 = DBL_MIN;
3200 envelopes[i].y1 = DBL_MAX;
3201 envelopes[i].y2 = DBL_MIN;
3202 }
3203
3204 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3205 if (blk1->tableId >= 0) {
3206 if (blk1->ExMin < envelopes[blk1->tableId].x1) {
3207 envelopes[blk1->tableId].x1 = blk1->ExMin;
3208 if (!blk1->page->primaryLR)
3209 ending_blocks[blk1->tableId] = blk1;
3210 }
3211
3212 if (blk1->ExMax > envelopes[blk1->tableId].x2) {
3213 envelopes[blk1->tableId].x2 = blk1->ExMax;
3214 if (blk1->page->primaryLR)
3215 ending_blocks[blk1->tableId] = blk1;
3216 }
3217
3218 envelopes[blk1->tableId].y1 = blk1->EyMin < envelopes[blk1->tableId].y1 ?
3219 blk1->EyMin : envelopes[blk1->tableId].y1;
3220 envelopes[blk1->tableId].y2 = blk1->EyMax > envelopes[blk1->tableId].y2 ?
3221 blk1->EyMax : envelopes[blk1->tableId].y2;
3222 }
3223 }
3224
3225 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3226 if (blk1->tableId >= 0 &&
3227 blk1->xMin <= ending_blocks[blk1->tableId]->xMax &&
3228 blk1->xMax >= ending_blocks[blk1->tableId]->xMin) {
3229 blk1->tableEnd = gTrue;
3230 }
3231 }
3232
3233 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3234 if (blk1->tableId >= 0) {
3235 blk1->ExMin = envelopes[blk1->tableId].x1;
3236 blk1->ExMax = envelopes[blk1->tableId].x2;
3237 blk1->EyMin = envelopes[blk1->tableId].y1;
3238 blk1->EyMax = envelopes[blk1->tableId].y2;
3239 }
3240 }
3241 delete[] envelopes;
3242 delete[] ending_blocks;
3243
3244
3245 /* set extended bounding boxes of all other blocks
3246 * so that they extend in x without hitting neighbours
3247 */
3248 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3249 if (!blk1->tableId >= 0) {
3250 double xMax = DBL_MAX;
3251 double xMin = DBL_MIN;
3252
3253 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3254 if (blk2 == blk1)
3255 continue;
3256
3257 if (blk1->yMin <= blk2->yMax && blk1->yMax >= blk2->yMin) {
3258 if (blk2->xMin < xMax && blk2->xMin > blk1->xMax)
3259 xMax = blk2->xMin;
3260
3261 if (blk2->xMax > xMin && blk2->xMax < blk1->xMin)
3262 xMin = blk2->xMax;
3263 }
3264 }
3265
3266 for (blk2 = blkList; blk2; blk2 = blk2->next) {
3267 if (blk2 == blk1)
3268 continue;
3269
3270 if (blk2->xMax > blk1->ExMax &&
3271 blk2->xMax <= xMax &&
3272 blk2->yMin >= blk1->yMax) {
3273 blk1->ExMax = blk2->xMax;
3274 }
3275
3276 if (blk2->xMin < blk1->ExMin &&
3277 blk2->xMin >= xMin &&
3278 blk2->yMin >= blk1->yMax)
3279 blk1->ExMin = blk2->xMin;
3280 }
3281 }
3282 }
3283
3284 i = -1;
3285 for (blk1 = blkList; blk1; blk1 = blk1->next) {
3286 i++;
3287 sortPos = blk1->visitDepthFirst(blkList, i, blocks, sortPos, visited);
3288 }
3289 if (visited) {
3290 gfree(visited);
3291 }
3292
3293 #if 0 // for debugging
3294 printf("*** blocks, after ro sort ***\n");
3295 for (i = 0; i < nBlocks; ++i) {
3296 blk = blocks[i];
3297 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
3298 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
3299 blk->priMin, blk->priMax);
3300 for (line = blk->lines; line; line = line->next) {
3301 printf(" line:\n");
3302 for (word0 = line->words; word0; word0 = word0->next) {
3303 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3304 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3305 word0->base, word0->fontSize, word0->spaceAfter);
3306 for (j = 0; j < word0->len; ++j) {
3307 fputc(word0->text[j] & 0xff, stdout);
3308 }
3309 printf("'\n");
3310 }
3311 }
3312 }
3313 printf("\n");
3314 fflush(stdout);
3315 #endif
3316
3317 // build the flows
3318 //~ this needs to be adjusted for writing mode (vertical text)
3319 //~ this also needs to account for right-to-left column ordering
3320 flow = NULL;
3321 while (flows) {
3322 flow = flows;
3323 flows = flows->next;
3324 delete flow;
3325 }
3326 flows = lastFlow = NULL;
3327 // assume blocks are already in reading order,
3328 // and construct flows accordingly.
3329 for (i = 0; i < nBlocks; i++) {
3330 blk = blocks[i];
3331 blk->next = NULL;
3332 if (flow) {
3333 blk1 = blocks[i - 1];
3334 blkSpace = maxBlockSpacing * blk1->lines->words->fontSize;
3335 if (blk1->secondaryDelta(blk) <= blkSpace &&
3336 blk->isBelow(blk1) &&
3337 flow->blockFits(blk, blk1)) {
3338 flow->addBlock(blk);
3339 continue;
3340 }
3341 }
3342 flow = new TextFlow(this, blk);
3343 if (lastFlow) {
3344 lastFlow->next = flow;
3345 } else {
3346 flows = flow;
3347 }
3348 lastFlow = flow;
3349 }
3350
3351 #if 0 // for debugging
3352 printf("*** flows ***\n");
3353 for (flow = flows; flow; flow = flow->next) {
3354 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
3355 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
3356 flow->priMin, flow->priMax);
3357 for (blk = flow->blocks; blk; blk = blk->next) {
3358 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
3359 blk->rot, blk->ExMin, blk->ExMax, blk->EyMin, blk->EyMax,
3360 blk->priMin, blk->priMax);
3361 for (line = blk->lines; line; line = line->next) {
3362 printf(" line:\n");
3363 for (word0 = line->words; word0; word0 = word0->next) {
3364 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
3365 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
3366 word0->base, word0->fontSize, word0->spaceAfter);
3367 for (i = 0; i < word0->len; ++i) {
3368 fputc(word0->text[i] & 0xff, stdout);
3369 }
3370 printf("'\n");
3371 }
3372 }
3373 }
3374 }
3375 printf("\n");
3376 #endif
3377
3378 if (uMap) {
3379 uMap->decRefCnt();
3380 }
3381 }
3382
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)3383 GBool TextPage::findText(Unicode *s, int len,
3384 GBool startAtTop, GBool stopAtBottom,
3385 GBool startAtLast, GBool stopAtLast,
3386 GBool caseSensitive, GBool backward,
3387 double *xMin, double *yMin,
3388 double *xMax, double *yMax) {
3389 TextBlock *blk;
3390 TextLine *line;
3391 Unicode *s2, *txt;
3392 Unicode *p;
3393 int txtSize, m, i, j, k;
3394 double xStart, yStart, xStop, yStop;
3395 double xMin0, yMin0, xMax0, yMax0;
3396 double xMin1, yMin1, xMax1, yMax1;
3397 GBool found;
3398
3399 //~ needs to handle right-to-left text
3400
3401 if (rawOrder) {
3402 return gFalse;
3403 }
3404
3405 // convert the search string to uppercase
3406 if (!caseSensitive) {
3407 s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
3408 for (i = 0; i < len; ++i) {
3409 s2[i] = unicodeToUpper(s2[i]);
3410 }
3411 } else {
3412 s2 = unicodeNormalizeNFKC(s, len, &len, NULL);
3413 }
3414
3415 txt = NULL;
3416 txtSize = 0;
3417
3418 xStart = yStart = xStop = yStop = 0;
3419 if (startAtLast && haveLastFind) {
3420 xStart = lastFindXMin;
3421 yStart = lastFindYMin;
3422 } else if (!startAtTop) {
3423 xStart = *xMin;
3424 yStart = *yMin;
3425 }
3426 if (stopAtLast && haveLastFind) {
3427 xStop = lastFindXMin;
3428 yStop = lastFindYMin;
3429 } else if (!stopAtBottom) {
3430 xStop = *xMax;
3431 yStop = *yMax;
3432 }
3433
3434 found = gFalse;
3435 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3436 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3437
3438 for (i = backward ? nBlocks - 1 : 0;
3439 backward ? i >= 0 : i < nBlocks;
3440 i += backward ? -1 : 1) {
3441 blk = blocks[i];
3442
3443 // check: is the block above the top limit?
3444 if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
3445 continue;
3446 }
3447
3448 // check: is the block below the bottom limit?
3449 if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
3450 break;
3451 }
3452
3453 for (line = blk->lines; line; line = line->next) {
3454
3455 // check: is the line above the top limit?
3456 if (!startAtTop &&
3457 (backward ? line->yMin > yStart : line->yMin < yStart)) {
3458 continue;
3459 }
3460
3461 // check: is the line below the bottom limit?
3462 if (!stopAtBottom &&
3463 (backward ? line->yMin < yStop : line->yMin > yStop)) {
3464 continue;
3465 }
3466
3467 if (!line->normalized)
3468 line->normalized = unicodeNormalizeNFKC(line->text, line->len,
3469 &line->normalized_len,
3470 &line->normalized_idx);
3471 // convert the line to uppercase
3472 m = line->normalized_len;
3473 if (!caseSensitive) {
3474 if (m > txtSize) {
3475 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3476 txtSize = m;
3477 }
3478 for (k = 0; k < m; ++k) {
3479 txt[k] = unicodeToUpper(line->normalized[k]);
3480 }
3481 } else {
3482 txt = line->normalized;
3483 }
3484
3485 // search each position in this line
3486 j = backward ? m - len : 0;
3487 p = txt + j;
3488 while (backward ? j >= 0 : j <= m - len) {
3489
3490 // compare the strings
3491 for (k = 0; k < len; ++k) {
3492 if (p[k] != s2[k]) {
3493 break;
3494 }
3495 }
3496
3497 // found it
3498 if (k == len) {
3499 // where s2 matches a subsequence of a compatibility equivalence
3500 // decomposition, highlight the entire glyph, since we don't know
3501 // the internal layout of subglyph components
3502 int normStart = line->normalized_idx[j];
3503 int normAfterEnd = line->normalized_idx[j + len - 1] + 1;
3504 switch (line->rot) {
3505 case 0:
3506 xMin1 = line->edge[normStart];
3507 xMax1 = line->edge[normAfterEnd];
3508 yMin1 = line->yMin;
3509 yMax1 = line->yMax;
3510 break;
3511 case 1:
3512 xMin1 = line->xMin;
3513 xMax1 = line->xMax;
3514 yMin1 = line->edge[normStart];
3515 yMax1 = line->edge[normAfterEnd];
3516 break;
3517 case 2:
3518 xMin1 = line->edge[normAfterEnd];
3519 xMax1 = line->edge[normStart];
3520 yMin1 = line->yMin;
3521 yMax1 = line->yMax;
3522 break;
3523 case 3:
3524 xMin1 = line->xMin;
3525 xMax1 = line->xMax;
3526 yMin1 = line->edge[normAfterEnd];
3527 yMax1 = line->edge[normStart];
3528 break;
3529 }
3530 if (backward) {
3531 if ((startAtTop ||
3532 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3533 (stopAtBottom ||
3534 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3535 if (!found ||
3536 yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3537 xMin0 = xMin1;
3538 xMax0 = xMax1;
3539 yMin0 = yMin1;
3540 yMax0 = yMax1;
3541 found = gTrue;
3542 }
3543 }
3544 } else {
3545 if ((startAtTop ||
3546 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3547 (stopAtBottom ||
3548 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3549 if (!found ||
3550 yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3551 xMin0 = xMin1;
3552 xMax0 = xMax1;
3553 yMin0 = yMin1;
3554 yMax0 = yMax1;
3555 found = gTrue;
3556 }
3557 }
3558 }
3559 }
3560 if (backward) {
3561 --j;
3562 --p;
3563 } else {
3564 ++j;
3565 ++p;
3566 }
3567 }
3568 }
3569 }
3570
3571 gfree(s2);
3572 if (!caseSensitive) {
3573 gfree(txt);
3574 }
3575
3576 if (found) {
3577 *xMin = xMin0;
3578 *xMax = xMax0;
3579 *yMin = yMin0;
3580 *yMax = yMax0;
3581 lastFindXMin = xMin0;
3582 lastFindYMin = yMin0;
3583 haveLastFind = gTrue;
3584 return gTrue;
3585 }
3586
3587 return gFalse;
3588 }
3589
getText(double xMin,double yMin,double xMax,double yMax)3590 GooString *TextPage::getText(double xMin, double yMin,
3591 double xMax, double yMax) {
3592 GooString *s;
3593 UnicodeMap *uMap;
3594 GBool isUnicode;
3595 TextBlock *blk;
3596 TextLine *line;
3597 TextLineFrag *frags;
3598 int nFrags, fragsSize;
3599 TextLineFrag *frag;
3600 char space[8], eol[16];
3601 int spaceLen, eolLen;
3602 int lastRot;
3603 double x, y, delta;
3604 int col, idx0, idx1, i, j;
3605 GBool multiLine, oneRot;
3606
3607 s = new GooString();
3608
3609 // get the output encoding
3610 if (!(uMap = globalParams->getTextEncoding())) {
3611 return s;
3612 }
3613
3614 if (rawOrder) {
3615 TextWord* word;
3616 char mbc[16];
3617 int mbc_len;
3618
3619 for (word = rawWords; word && word <= rawLastWord; word = word->next) {
3620 for (j = 0; j < word->getLength(); ++j) {
3621 double gXMin, gXMax, gYMin, gYMax;
3622 word->getCharBBox(j, &gXMin, &gYMin, &gXMax, &gYMax);
3623 if (xMin <= gXMin && gXMax <= xMax && yMin <= gYMin && gYMax <= yMax)
3624 {
3625 mbc_len = uMap->mapUnicode( *(word->getChar(j)), mbc, sizeof(mbc) );
3626 s->append(mbc, mbc_len);
3627 }
3628 }
3629 }
3630 return s;
3631 }
3632
3633 isUnicode = uMap->isUnicode();
3634 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3635 eolLen = 0; // make gcc happy
3636 switch (globalParams->getTextEOL()) {
3637 case eolUnix:
3638 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3639 break;
3640 case eolDOS:
3641 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3642 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3643 break;
3644 case eolMac:
3645 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3646 break;
3647 }
3648
3649 //~ writing mode (horiz/vert)
3650
3651 // collect the line fragments that are in the rectangle
3652 fragsSize = 256;
3653 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3654 nFrags = 0;
3655 lastRot = -1;
3656 oneRot = gTrue;
3657 for (i = 0; i < nBlocks; ++i) {
3658 blk = blocks[i];
3659 if (xMin < blk->xMax && blk->xMin < xMax &&
3660 yMin < blk->yMax && blk->yMin < yMax) {
3661 for (line = blk->lines; line; line = line->next) {
3662 if (xMin < line->xMax && line->xMin < xMax &&
3663 yMin < line->yMax && line->yMin < yMax) {
3664 idx0 = idx1 = -1;
3665 switch (line->rot) {
3666 case 0:
3667 y = 0.5 * (line->yMin + line->yMax);
3668 if (yMin < y && y < yMax) {
3669 j = 0;
3670 while (j < line->len) {
3671 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3672 idx0 = j;
3673 break;
3674 }
3675 ++j;
3676 }
3677 j = line->len - 1;
3678 while (j >= 0) {
3679 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3680 idx1 = j;
3681 break;
3682 }
3683 --j;
3684 }
3685 }
3686 break;
3687 case 1:
3688 x = 0.5 * (line->xMin + line->xMax);
3689 if (xMin < x && x < xMax) {
3690 j = 0;
3691 while (j < line->len) {
3692 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3693 idx0 = j;
3694 break;
3695 }
3696 ++j;
3697 }
3698 j = line->len - 1;
3699 while (j >= 0) {
3700 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3701 idx1 = j;
3702 break;
3703 }
3704 --j;
3705 }
3706 }
3707 break;
3708 case 2:
3709 y = 0.5 * (line->yMin + line->yMax);
3710 if (yMin < y && y < yMax) {
3711 j = 0;
3712 while (j < line->len) {
3713 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
3714 idx0 = j;
3715 break;
3716 }
3717 ++j;
3718 }
3719 j = line->len - 1;
3720 while (j >= 0) {
3721 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
3722 idx1 = j;
3723 break;
3724 }
3725 --j;
3726 }
3727 }
3728 break;
3729 case 3:
3730 x = 0.5 * (line->xMin + line->xMax);
3731 if (xMin < x && x < xMax) {
3732 j = 0;
3733 while (j < line->len) {
3734 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
3735 idx0 = j;
3736 break;
3737 }
3738 ++j;
3739 }
3740 j = line->len - 1;
3741 while (j >= 0) {
3742 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
3743 idx1 = j;
3744 break;
3745 }
3746 --j;
3747 }
3748 }
3749 break;
3750 }
3751 if (idx0 >= 0 && idx1 >= 0) {
3752 if (nFrags == fragsSize) {
3753 fragsSize *= 2;
3754 frags = (TextLineFrag *)
3755 greallocn(frags, fragsSize, sizeof(TextLineFrag));
3756 }
3757 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
3758 ++nFrags;
3759 if (lastRot >= 0 && line->rot != lastRot) {
3760 oneRot = gFalse;
3761 }
3762 lastRot = line->rot;
3763 }
3764 }
3765 }
3766 }
3767 }
3768
3769 // sort the fragments and generate the string
3770 if (nFrags > 0) {
3771
3772 for (i = 0; i < nFrags; ++i) {
3773 frags[i].computeCoords(oneRot);
3774 }
3775 assignColumns(frags, nFrags, oneRot);
3776
3777 // if all lines in the region have the same rotation, use it;
3778 // otherwise, use the page's primary rotation
3779 if (oneRot) {
3780 qsort(frags, nFrags, sizeof(TextLineFrag),
3781 &TextLineFrag::cmpYXLineRot);
3782 } else {
3783 qsort(frags, nFrags, sizeof(TextLineFrag),
3784 &TextLineFrag::cmpYXPrimaryRot);
3785 }
3786 i = 0;
3787 while (i < nFrags) {
3788 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
3789 for (j = i+1;
3790 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
3791 ++j) ;
3792 qsort(frags + i, j - i, sizeof(TextLineFrag),
3793 oneRot ? &TextLineFrag::cmpXYColumnLineRot
3794 : &TextLineFrag::cmpXYColumnPrimaryRot);
3795 i = j;
3796 }
3797
3798 col = 0;
3799 multiLine = gFalse;
3800 for (i = 0; i < nFrags; ++i) {
3801 frag = &frags[i];
3802
3803 // insert a return
3804 if (frag->col < col ||
3805 (i > 0 && fabs(frag->base - frags[i-1].base) >
3806 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3807 s->append(eol, eolLen);
3808 col = 0;
3809 multiLine = gTrue;
3810 }
3811
3812 // column alignment
3813 for (; col < frag->col; ++col) {
3814 s->append(space, spaceLen);
3815 }
3816
3817 // get the fragment text
3818 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3819 }
3820
3821 if (multiLine) {
3822 s->append(eol, eolLen);
3823 }
3824 }
3825
3826 gfree(frags);
3827 uMap->decRefCnt();
3828
3829 return s;
3830 }
3831
3832 class TextSelectionVisitor {
3833 public:
3834 TextSelectionVisitor (TextPage *page);
~TextSelectionVisitor()3835 virtual ~TextSelectionVisitor () { }
3836 virtual void visitBlock (TextBlock *block,
3837 TextLine *begin,
3838 TextLine *end,
3839 PDFRectangle *selection) = 0;
3840 virtual void visitLine (TextLine *line,
3841 TextWord *begin,
3842 TextWord *end,
3843 int edge_begin,
3844 int edge_end,
3845 PDFRectangle *selection) = 0;
3846 virtual void visitWord (TextWord *word, int begin, int end,
3847 PDFRectangle *selection) = 0;
3848
3849 protected:
3850 TextPage *page;
3851 };
3852
TextSelectionVisitor(TextPage * page)3853 TextSelectionVisitor::TextSelectionVisitor (TextPage *page)
3854 : page(page)
3855 {
3856 }
3857
3858
3859 class TextSelectionDumper : public TextSelectionVisitor {
3860 public:
3861 TextSelectionDumper(TextPage *page);
3862 virtual ~TextSelectionDumper();
3863
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)3864 virtual void visitBlock (TextBlock *block,
3865 TextLine *begin,
3866 TextLine *end,
3867 PDFRectangle *selection) { };
3868 virtual void visitLine (TextLine *line,
3869 TextWord *begin,
3870 TextWord *end,
3871 int edge_begin,
3872 int edge_end,
3873 PDFRectangle *selection);
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)3874 virtual void visitWord (TextWord *word, int begin, int end,
3875 PDFRectangle *selection) { };
3876
3877 GooString *getText(void);
3878
3879 private:
3880 TextLineFrag *frags;
3881 int nFrags, fragsSize;
3882 };
3883
TextSelectionDumper(TextPage * page)3884 TextSelectionDumper::TextSelectionDumper(TextPage *page)
3885 : TextSelectionVisitor(page)
3886 {
3887 fragsSize = 256;
3888 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3889 nFrags = 0;
3890 }
3891
~TextSelectionDumper()3892 TextSelectionDumper::~TextSelectionDumper()
3893 {
3894 gfree(frags);
3895 }
3896
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)3897 void TextSelectionDumper::visitLine (TextLine *line,
3898 TextWord *begin,
3899 TextWord *end,
3900 int edge_begin,
3901 int edge_end,
3902 PDFRectangle *selection)
3903 {
3904 if (nFrags == fragsSize) {
3905 fragsSize *= 2;
3906 frags = (TextLineFrag *) grealloc(frags, fragsSize * sizeof(TextLineFrag));
3907 }
3908
3909 frags[nFrags].init(line, edge_begin, edge_end - edge_begin);
3910 ++nFrags;
3911
3912 }
3913
getText(void)3914 GooString *TextSelectionDumper::getText (void)
3915 {
3916 GooString *s;
3917 TextLineFrag *frag;
3918 int i, j;
3919 GBool multiLine;
3920 UnicodeMap *uMap;
3921 char space[8], eol[16];
3922 int spaceLen, eolLen;
3923 GooList *strings = NULL;
3924 int actual_table = -1;
3925 int actual_line = -1;
3926 int last_length = 0;
3927 TextBlock *actual_block = NULL;
3928
3929 s = new GooString();
3930
3931 uMap = globalParams->getTextEncoding();
3932
3933 if (uMap == NULL)
3934 return s;
3935
3936 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3937 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3938
3939 if (nFrags > 0) {
3940 multiLine = gFalse;
3941 for (i = 0; i < nFrags; ++i) {
3942 frag = &frags[i];
3943
3944 if (actual_table >= 0 && frag->line->blk->tableId < 0) {
3945 for (j = 0; j < strings->getLength (); j++) {
3946 s->append ((GooString*) strings->get (j));
3947 s->append (eol, eolLen);
3948 delete ((GooString*) strings->get (j));
3949 }
3950 delete strings;
3951 strings = NULL;
3952 actual_table = -1;
3953 actual_line = -1;
3954 actual_block = NULL;
3955 }
3956
3957 // a table
3958 if (frag->line->blk->tableId >= 0) {
3959 if (actual_table == -1) {
3960 strings = new GooList();
3961 actual_table = frag->line->blk->tableId;
3962 actual_block = frag->line->blk;
3963 actual_line = -1;
3964 }
3965
3966 // the same block
3967 if (actual_block == frag->line->blk) {
3968 actual_line++;
3969 if (actual_line >= strings->getLength ()) {
3970 GooString *t = new GooString ();
3971 // add some spaces to have this block correctly aligned
3972 if (actual_line > 0)
3973 for (j = 0; j < ((GooString*) (strings->get (actual_line - 1)))->getLength() - last_length - 1; j++)
3974 t->append (space, spaceLen);
3975 strings->append (t);
3976 }
3977 }
3978 // another block
3979 else {
3980 // previous block ended its row
3981 if (actual_block->tableEnd) {
3982 for (j = 0; j < strings->getLength (); j++) {
3983 s->append ((GooString*) strings->get (j));
3984 s->append (eol, eolLen);
3985 delete ((GooString*) strings->get (j));
3986 }
3987 delete strings;
3988
3989 strings = new GooList();
3990 GooString *t = new GooString ();
3991 strings->append (t);
3992 }
3993 actual_block = frag->line->blk;
3994 actual_line = 0;
3995 }
3996
3997 page->dumpFragment(frag->line->text + frag->start, frag->len, uMap, ((GooString*) strings->get (actual_line)));
3998 last_length = frag->len;
3999
4000 if (!frag->line->blk->tableEnd) {
4001 ((GooString*) strings->get (actual_line))->append (space, spaceLen);
4002 }
4003 }
4004 // not a table
4005 else {
4006 page->dumpFragment (frag->line->text + frag->start, frag->len, uMap, s);
4007 s->append (eol, eolLen);
4008 }
4009 }
4010
4011 if (strings != NULL) {
4012 for (j = 0; j < strings->getLength (); j++) {
4013 s->append((GooString*) strings->get (j));
4014 s->append(eol, eolLen);
4015 delete ((GooString*) strings->get (j));
4016 }
4017 delete strings;
4018 strings = NULL;
4019 actual_table = -1;
4020 actual_line = -1;
4021 actual_block = NULL;
4022 }
4023 }
4024
4025 uMap->decRefCnt();
4026
4027 return s;
4028 }
4029
4030 class TextSelectionSizer : public TextSelectionVisitor {
4031 public:
4032 TextSelectionSizer(TextPage *page, double scale);
~TextSelectionSizer()4033 ~TextSelectionSizer() { }
4034
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)4035 virtual void visitBlock (TextBlock *block,
4036 TextLine *begin,
4037 TextLine *end,
4038 PDFRectangle *selection) { };
4039 virtual void visitLine (TextLine *line,
4040 TextWord *begin,
4041 TextWord *end,
4042 int edge_begin,
4043 int edge_end,
4044 PDFRectangle *selection);
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)4045 virtual void visitWord (TextWord *word, int begin, int end,
4046 PDFRectangle *selection) { };
4047
getRegion()4048 GooList *getRegion () { return list; }
4049
4050 private:
4051 GooList *list;
4052 double scale;
4053 };
4054
TextSelectionSizer(TextPage * page,double scale)4055 TextSelectionSizer::TextSelectionSizer(TextPage *page, double scale)
4056 : TextSelectionVisitor(page),
4057 scale(scale)
4058 {
4059 list = new GooList();
4060 }
4061
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)4062 void TextSelectionSizer::visitLine (TextLine *line,
4063 TextWord *begin,
4064 TextWord *end,
4065 int edge_begin,
4066 int edge_end,
4067 PDFRectangle *selection)
4068 {
4069 PDFRectangle *rect;
4070 double x1, y1, x2, y2, margin;
4071
4072 margin = (line->yMax - line->yMin) / 8;
4073 x1 = line->edge[edge_begin];
4074 y1 = line->yMin - margin;
4075 x2 = line->edge[edge_end];
4076 y2 = line->yMax + margin;
4077
4078 rect = new PDFRectangle (floor (x1 * scale),
4079 floor (y1 * scale),
4080 ceil (x2 * scale),
4081 ceil (y2 * scale));
4082 list->append (rect);
4083 }
4084
4085
4086 class TextSelectionPainter : public TextSelectionVisitor {
4087 public:
4088 TextSelectionPainter(TextPage *page,
4089 double scale,
4090 int rotation,
4091 OutputDev *out,
4092 GfxColor *box_color,
4093 GfxColor *glyph_color);
4094 ~TextSelectionPainter();
4095
visitBlock(TextBlock * block,TextLine * begin,TextLine * end,PDFRectangle * selection)4096 virtual void visitBlock (TextBlock *block,
4097 TextLine *begin,
4098 TextLine *end,
4099 PDFRectangle *selection) { };
4100 virtual void visitLine (TextLine *line,
4101 TextWord *begin,
4102 TextWord *end,
4103 int edge_begin,
4104 int edge_end,
4105 PDFRectangle *selection);
4106 virtual void visitWord (TextWord *word, int begin, int end,
4107 PDFRectangle *selection);
4108
4109 private:
4110 OutputDev *out;
4111 GfxColor *box_color, *glyph_color;
4112 GfxState *state;
4113 };
4114
TextSelectionPainter(TextPage * page,double scale,int rotation,OutputDev * out,GfxColor * box_color,GfxColor * glyph_color)4115 TextSelectionPainter::TextSelectionPainter(TextPage *page,
4116 double scale,
4117 int rotation,
4118 OutputDev *out,
4119 GfxColor *box_color,
4120 GfxColor *glyph_color)
4121 : TextSelectionVisitor(page),
4122 out(out),
4123 box_color(box_color),
4124 glyph_color(glyph_color)
4125 {
4126 PDFRectangle box(0, 0, page->pageWidth, page->pageHeight);
4127
4128 state = new GfxState(72 * scale, 72 * scale, &box, rotation, gFalse);
4129
4130 out->startPage (0, state);
4131 out->setDefaultCTM (state->getCTM());
4132
4133 state->setTextMat(1, 0, 0, -1, 0, 0);
4134 state->setFillColorSpace(new GfxDeviceRGBColorSpace());
4135
4136 }
4137
~TextSelectionPainter()4138 TextSelectionPainter::~TextSelectionPainter()
4139 {
4140 out->endPage ();
4141
4142 delete state;
4143 }
4144
visitLine(TextLine * line,TextWord * begin,TextWord * end,int edge_begin,int edge_end,PDFRectangle * selection)4145 void TextSelectionPainter::visitLine (TextLine *line,
4146 TextWord *begin,
4147 TextWord *end,
4148 int edge_begin,
4149 int edge_end,
4150 PDFRectangle *selection)
4151 {
4152 double x1, y1, x2, y2, margin;
4153 Matrix ctm, ictm;
4154
4155 state->setFillColor(box_color);
4156 out->updateFillColor(state);
4157
4158 margin = (line->yMax - line->yMin) / 8;
4159 x1 = floor (line->edge[edge_begin]);
4160 y1 = floor (line->yMin - margin);
4161 x2 = ceil (line->edge[edge_end]);
4162 y2 = ceil (line->yMax + margin);
4163
4164 state->getCTM (&ctm);
4165 ctm.transform(line->edge[edge_begin], line->yMin - margin, &x1, &y1);
4166 ctm.transform(line->edge[edge_end], line->yMax + margin, &x2, &y2);
4167
4168 x1 = floor (x1);
4169 y1 = floor (y1);
4170 x2 = ceil (x2);
4171 y2 = ceil (y2);
4172
4173 ctm.invertTo (&ictm);
4174 ictm.transform(x1, y1, &x1, &y1);
4175 ictm.transform(x2, y2, &x2, &y2);
4176
4177 state->moveTo(x1, y1);
4178 state->lineTo(x2, y1);
4179 state->lineTo(x2, y2);
4180 state->lineTo(x1, y2);
4181 state->closePath();
4182
4183 out->fill(state);
4184 state->clearPath();
4185 }
4186
visitWord(TextWord * word,int begin,int end,PDFRectangle * selection)4187 void TextSelectionPainter::visitWord (TextWord *word, int begin, int end,
4188 PDFRectangle *selection)
4189 {
4190 GooString *string;
4191 int i;
4192
4193 state->setFillColor(glyph_color);
4194 out->updateFillColor(state);
4195 word->font->gfxFont->incRefCnt();
4196 state->setFont(word->font->gfxFont, word->fontSize);
4197 out->updateFont(state);
4198
4199 /* The only purpose of this string is to let the output device query
4200 * it's length. Might want to change this interface later. */
4201
4202 string = new GooString ((char *) word->charcode, end - begin);
4203
4204 out->beginString(state, string);
4205
4206 for (i = begin; i < end; i++)
4207 out->drawChar(state, word->edge[i], word->base, 0, 0, 0, 0,
4208 word->charcode[i], 1, NULL, 0);
4209
4210 out->endString(state);
4211
4212 delete string;
4213 }
4214
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4215 void TextWord::visitSelection(TextSelectionVisitor *visitor,
4216 PDFRectangle *selection,
4217 SelectionStyle style)
4218 {
4219 int i, begin, end;
4220 double mid;
4221
4222 begin = len;
4223 end = 0;
4224 for (i = 0; i < len; i++) {
4225 mid = (edge[i] + edge[i + 1]) / 2;
4226 if (selection->x1 < mid || selection->x2 < mid)
4227 if (i < begin)
4228 begin = i;
4229 if (mid < selection->x1 || mid < selection->x2)
4230 end = i + 1;
4231 }
4232
4233 /* Skip empty selection. */
4234 if (end <= begin)
4235 return;
4236
4237 visitor->visitWord (this, begin, end, selection);
4238 }
4239
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4240 void TextLine::visitSelection(TextSelectionVisitor *visitor,
4241 PDFRectangle *selection,
4242 SelectionStyle style) {
4243 TextWord *p, *begin, *end, *current;
4244 int i, edge_begin, edge_end;
4245 PDFRectangle child_selection;
4246
4247 begin = NULL;
4248 end = NULL;
4249 current = NULL;
4250 for (p = words; p != NULL; p = p->next) {
4251 if (blk->page->primaryLR) {
4252 if ((selection->x1 < p->xMax) ||
4253 (selection->x2 < p->xMax))
4254 if (begin == NULL)
4255 begin = p;
4256
4257 if (((selection->x1 > p->xMin) ||
4258 (selection->x2 > p->xMin)) && (begin != NULL)) {
4259 end = p->next;
4260 current = p;
4261 }
4262 } else {
4263 if ((selection->x1 > p->xMin) ||
4264 (selection->x2 > p->xMin))
4265 if (begin == NULL)
4266 begin = p;
4267
4268 if (((selection->x1 < p->xMax) ||
4269 (selection->x2 < p->xMax)) && (begin != NULL)) {
4270 end = p->next;
4271 current = p;
4272 }
4273 }
4274 }
4275
4276 if (!current)
4277 current = begin;
4278
4279 child_selection = *selection;
4280 if (style == selectionStyleWord) {
4281 child_selection.x1 = begin ? begin->xMin : xMin;
4282 if (end && end->xMax != -1) {
4283 child_selection.x2 = current->xMax;
4284 } else {
4285 child_selection.x2 = xMax;
4286 }
4287 }
4288
4289 edge_begin = len;
4290 edge_end = 0;
4291 for (i = 0; i < len; i++) {
4292 double mid = (edge[i] + edge[i + 1]) / 2;
4293 if (child_selection.x1 < mid || child_selection.x2 < mid)
4294 if (i < edge_begin)
4295 edge_begin = i;
4296 if (mid < child_selection.x2 || mid < child_selection.x1)
4297 edge_end = i + 1;
4298 }
4299
4300 /* Skip empty selection. */
4301 if (edge_end <= edge_begin)
4302 return;
4303
4304 visitor->visitLine (this, begin, end, edge_begin, edge_end,
4305 &child_selection);
4306
4307 for (p = begin; p != end; p = p->next)
4308 p->visitSelection (visitor, &child_selection, style);
4309 }
4310
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4311 void TextBlock::visitSelection(TextSelectionVisitor *visitor,
4312 PDFRectangle *selection,
4313 SelectionStyle style) {
4314 PDFRectangle child_selection;
4315 double x[2], y[2], d, best_d[2];
4316 TextLine *p, *best_line[2];
4317 int i, count = 0, best_count[2], start, stop;
4318 GBool all[2];
4319
4320 x[0] = selection->x1;
4321 y[0] = selection->y1;
4322 x[1] = selection->x2;
4323 y[1] = selection->y2;
4324
4325 for (i = 0; i < 2; i++) {
4326 // the first/last lines are often not nearest
4327 // the corners, so we have to force them to be
4328 // selected when the selection runs outside this
4329 // block.
4330 if (page->primaryLR) {
4331 all[i] = x[i] >= this->xMax && y[i] >= this->yMax;
4332 if (x[i] <= this->xMin && y[i] <= this->yMin) {
4333 best_line[i] = this->lines;
4334 best_count[i] = 1;
4335 } else {
4336 best_line[i] = NULL;
4337 best_count[i] = 0;
4338 }
4339 } else {
4340 all[i] = x[i] <= this->xMin && y[i] >= this->yMax;
4341 if (x[i] >= this->xMax && y[i] <= this->yMin) {
4342 best_line[i] = this->lines;
4343 best_count[i] = 1;
4344 } else {
4345 best_line[i] = NULL;
4346 best_count[i] = 0;
4347 }
4348 }
4349 best_d[i] = 0;
4350 }
4351
4352 // find the nearest line to the selection points
4353 // using the manhattan distance.
4354 for (p = this->lines; p; p = p->next) {
4355 count++;
4356 for (i = 0; i < 2; i++) {
4357 d = fmax(p->xMin - x[i], 0.0) +
4358 fmax(x[i] - p->xMax, 0.0) +
4359 fmax(p->yMin - y[i], 0.0) +
4360 fmax(y[i] - p->yMax, 0.0);
4361 if (!best_line[i] || all[i] ||
4362 d < best_d[i]) {
4363 best_line[i] = p;
4364 best_count[i] = count;
4365 best_d[i] = d;
4366 }
4367 }
4368 }
4369 // assert: best is always set.
4370 if (!best_line[0] || !best_line[1]) {
4371 return;
4372 }
4373
4374 // Now decide which point was first.
4375 if (best_count[0] < best_count[1] ||
4376 (best_count[0] == best_count[1] &&
4377 y[0] < y[1])) {
4378 start = 0;
4379 stop = 1;
4380 } else {
4381 start = 1;
4382 stop = 0;
4383 }
4384
4385 visitor->visitBlock(this, best_line[start], best_line[stop], selection);
4386
4387 for (p = best_line[start]; p; p = p->next) {
4388 if (page->primaryLR) {
4389 child_selection.x1 = p->xMin;
4390 child_selection.x2 = p->xMax;
4391 } else {
4392 child_selection.x1 = p->xMax;
4393 child_selection.x2 = p->xMin;
4394 }
4395 child_selection.y1 = p->yMin;
4396 child_selection.y2 = p->yMax;
4397 if (style == selectionStyleLine) {
4398 if (p == best_line[start]) {
4399 child_selection.x1 = 0;
4400 child_selection.y1 = 0;
4401 }
4402 if (p == best_line[stop]) {
4403 child_selection.x2 = page->pageWidth;
4404 child_selection.y2 = page->pageHeight;
4405 }
4406 } else {
4407 if (p == best_line[start]) {
4408 child_selection.x1 = fmax(p->xMin, fmin(p->xMax, x[start]));
4409 child_selection.y1 = fmax(p->yMin, fmin(p->yMax, y[start]));
4410 }
4411 if (p == best_line[stop]) {
4412 child_selection.x2 = fmax(p->xMin, fmin(p->xMax, x[stop]));
4413 child_selection.y2 = fmax(p->yMin, fmin(p->yMax, y[stop]));
4414 }
4415 }
4416 p->visitSelection(visitor, &child_selection, style);
4417 if (p == best_line[stop]) {
4418 return;
4419 }
4420 }
4421 }
4422
visitSelection(TextSelectionVisitor * visitor,PDFRectangle * selection,SelectionStyle style)4423 void TextPage::visitSelection(TextSelectionVisitor *visitor,
4424 PDFRectangle *selection,
4425 SelectionStyle style)
4426 {
4427 PDFRectangle child_selection;
4428 double x[2], y[2], d, best_d[2];
4429 double xMin, yMin, xMax, yMax;
4430 TextFlow *flow, *best_flow[2];
4431 TextBlock *blk, *best_block[2];
4432 int i, count = 0, best_count[2], start, stop;
4433
4434 if (!flows)
4435 return;
4436
4437 x[0] = selection->x1;
4438 y[0] = selection->y1;
4439 x[1] = selection->x2;
4440 y[1] = selection->y2;
4441
4442 xMin = pageWidth;
4443 yMin = pageHeight;
4444 xMax = 0.0;
4445 yMax = 0.0;
4446
4447 for (i = 0; i < 2; i++) {
4448 best_block[i] = NULL;
4449 best_flow[i] = NULL;
4450 best_count[i] = 0;
4451 best_d[i] = 0;
4452 }
4453
4454 // find the nearest blocks to the selection points
4455 // using the manhattan distance.
4456 for (flow = flows; flow; flow = flow->next) {
4457 for (blk = flow->blocks; blk; blk = blk->next) {
4458 count++;
4459 // the first/last blocks in reading order are
4460 // often not the closest to the page corners;
4461 // track the corners, force those blocks to
4462 // be selected if the selection runs across
4463 // multiple pages.
4464 xMin = fmin(xMin, blk->xMin);
4465 yMin = fmin(yMin, blk->yMin);
4466 xMax = fmax(xMax, blk->xMax);
4467 yMax = fmax(yMax, blk->yMax);
4468 for (i = 0; i < 2; i++) {
4469 d = fmax(blk->xMin - x[i], 0.0) +
4470 fmax(x[i] - blk->xMax, 0.0) +
4471 fmax(blk->yMin - y[i], 0.0) +
4472 fmax(y[i] - blk->yMax, 0.0);
4473 if (!best_block[i] ||
4474 d < best_d[i] ||
4475 (!blk->next && !flow->next &&
4476 x[i] > xMax && y[i] > yMax)) {
4477 best_block[i] = blk;
4478 best_flow[i] = flow;
4479 best_count[i] = count;
4480 best_d[i] = d;
4481 }
4482 }
4483 }
4484 }
4485 for (i = 0; i < 2; i++) {
4486 if (primaryLR) {
4487 if (x[i] < xMin && y[i] < yMin) {
4488 best_block[i] = flows->blocks;
4489 best_flow[i] = flows;
4490 best_count[i] = 1;
4491 }
4492 } else {
4493 if (x[i] > xMax && y[i] < yMin) {
4494 best_block[i] = flows->blocks;
4495 best_flow[i] = flows;
4496 best_count[i] = 1;
4497 }
4498 }
4499 }
4500 // assert: best is always set.
4501 if (!best_block[0] || !best_block[1]) {
4502 return;
4503 }
4504
4505 // Now decide which point was first.
4506 if (best_count[0] < best_count[1] ||
4507 (best_count[0] == best_count[1] && y[0] < y[1])) {
4508 start = 0;
4509 stop = 1;
4510 } else {
4511 start = 1;
4512 stop = 0;
4513 }
4514
4515 for (flow = best_flow[start]; flow; flow = flow->next) {
4516 if (flow == best_flow[start]) {
4517 blk = best_block[start];
4518 } else {
4519 blk = flow->blocks;
4520 }
4521 for (; blk; blk = blk->next) {
4522 if (primaryLR) {
4523 child_selection.x1 = blk->xMin;
4524 child_selection.x2 = blk->xMax;
4525 } else {
4526 child_selection.x1 = blk->xMax;
4527 child_selection.x2 = blk->xMin;
4528 }
4529 child_selection.y1 = blk->yMin;
4530 child_selection.y2 = blk->yMax;
4531 if (blk == best_block[start]) {
4532 child_selection.x1 = fmax(blk->xMin, fmin(blk->xMax, x[start]));
4533 child_selection.y1 = fmax(blk->yMin, fmin(blk->yMax, y[start]));
4534 }
4535 if (blk == best_block[stop]) {
4536 child_selection.x2 = fmax(blk->xMin, fmin(blk->xMax, x[stop]));
4537 child_selection.y2 = fmax(blk->yMin, fmin(blk->yMax, y[stop]));
4538 blk->visitSelection(visitor, &child_selection, style);
4539 return;
4540 }
4541 blk->visitSelection(visitor, &child_selection, style);
4542 }
4543 }
4544 }
4545
drawSelection(OutputDev * out,double scale,int rotation,PDFRectangle * selection,SelectionStyle style,GfxColor * glyph_color,GfxColor * box_color)4546 void TextPage::drawSelection(OutputDev *out,
4547 double scale,
4548 int rotation,
4549 PDFRectangle *selection,
4550 SelectionStyle style,
4551 GfxColor *glyph_color, GfxColor *box_color)
4552 {
4553 TextSelectionPainter painter(this, scale, rotation,
4554 out, box_color, glyph_color);
4555
4556 visitSelection(&painter, selection, style);
4557 }
4558
getSelectionRegion(PDFRectangle * selection,SelectionStyle style,double scale)4559 GooList *TextPage::getSelectionRegion(PDFRectangle *selection,
4560 SelectionStyle style,
4561 double scale) {
4562 TextSelectionSizer sizer(this, scale);
4563
4564 visitSelection(&sizer, selection, style);
4565
4566 return sizer.getRegion();
4567 }
4568
getSelectionText(PDFRectangle * selection,SelectionStyle style)4569 GooString *TextPage::getSelectionText(PDFRectangle *selection,
4570 SelectionStyle style)
4571 {
4572 TextSelectionDumper dumper(this);
4573
4574 visitSelection(&dumper, selection, style);
4575
4576 return dumper.getText();
4577 }
4578
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)4579 GBool TextPage::findCharRange(int pos, int length,
4580 double *xMin, double *yMin,
4581 double *xMax, double *yMax) {
4582 TextBlock *blk;
4583 TextLine *line;
4584 TextWord *word;
4585 double xMin0, xMax0, yMin0, yMax0;
4586 double xMin1, xMax1, yMin1, yMax1;
4587 GBool first;
4588 int i, j0, j1;
4589
4590 if (rawOrder) {
4591 return gFalse;
4592 }
4593
4594 //~ this doesn't correctly handle:
4595 //~ - ranges split across multiple lines (the highlighted region
4596 //~ is the bounding box of all the parts of the range)
4597 //~ - cases where characters don't convert one-to-one into Unicode
4598 first = gTrue;
4599 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
4600 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
4601 for (i = 0; i < nBlocks; ++i) {
4602 blk = blocks[i];
4603 for (line = blk->lines; line; line = line->next) {
4604 for (word = line->words; word; word = word->next) {
4605 if (pos < word->charPos + word->charLen &&
4606 word->charPos < pos + length) {
4607 j0 = pos - word->charPos;
4608 if (j0 < 0) {
4609 j0 = 0;
4610 }
4611 j1 = pos + length - 1 - word->charPos;
4612 if (j1 >= word->len) {
4613 j1 = word->len - 1;
4614 }
4615 switch (line->rot) {
4616 case 0:
4617 xMin1 = word->edge[j0];
4618 xMax1 = word->edge[j1 + 1];
4619 yMin1 = word->yMin;
4620 yMax1 = word->yMax;
4621 break;
4622 case 1:
4623 xMin1 = word->xMin;
4624 xMax1 = word->xMax;
4625 yMin1 = word->edge[j0];
4626 yMax1 = word->edge[j1 + 1];
4627 break;
4628 case 2:
4629 xMin1 = word->edge[j1 + 1];
4630 xMax1 = word->edge[j0];
4631 yMin1 = word->yMin;
4632 yMax1 = word->yMax;
4633 break;
4634 case 3:
4635 xMin1 = word->xMin;
4636 xMax1 = word->xMax;
4637 yMin1 = word->edge[j1 + 1];
4638 yMax1 = word->edge[j0];
4639 break;
4640 }
4641 if (first || xMin1 < xMin0) {
4642 xMin0 = xMin1;
4643 }
4644 if (first || xMax1 > xMax0) {
4645 xMax0 = xMax1;
4646 }
4647 if (first || yMin1 < yMin0) {
4648 yMin0 = yMin1;
4649 }
4650 if (first || yMax1 > yMax0) {
4651 yMax0 = yMax1;
4652 }
4653 first = gFalse;
4654 }
4655 }
4656 }
4657 }
4658 if (!first) {
4659 *xMin = xMin0;
4660 *xMax = xMax0;
4661 *yMin = yMin0;
4662 *yMax = yMax0;
4663 return gTrue;
4664 }
4665 return gFalse;
4666 }
4667
dump(void * outputStream,TextOutputFunc outputFunc,GBool physLayout)4668 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
4669 GBool physLayout) {
4670 UnicodeMap *uMap;
4671 TextFlow *flow;
4672 TextBlock *blk;
4673 TextLine *line;
4674 TextLineFrag *frags;
4675 TextWord *word;
4676 int nFrags, fragsSize;
4677 TextLineFrag *frag;
4678 char space[8], eol[16], eop[8];
4679 int spaceLen, eolLen, eopLen;
4680 GBool pageBreaks;
4681 GooString *s;
4682 double delta;
4683 int col, i, j, d, n;
4684
4685 // get the output encoding
4686 if (!(uMap = globalParams->getTextEncoding())) {
4687 return;
4688 }
4689 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
4690 eolLen = 0; // make gcc happy
4691 switch (globalParams->getTextEOL()) {
4692 case eolUnix:
4693 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
4694 break;
4695 case eolDOS:
4696 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4697 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
4698 break;
4699 case eolMac:
4700 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
4701 break;
4702 }
4703 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
4704 pageBreaks = globalParams->getTextPageBreaks();
4705
4706 //~ writing mode (horiz/vert)
4707
4708 // output the page in raw (content stream) order
4709 if (rawOrder) {
4710
4711 for (word = rawWords; word; word = word->next) {
4712 s = new GooString();
4713 dumpFragment(word->text, word->len, uMap, s);
4714 (*outputFunc)(outputStream, s->getCString(), s->getLength());
4715 delete s;
4716 if (word->next &&
4717 fabs(word->next->base - word->base) <
4718 maxIntraLineDelta * word->fontSize) {
4719 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
4720 (*outputFunc)(outputStream, space, spaceLen);
4721 }
4722 } else {
4723 (*outputFunc)(outputStream, eol, eolLen);
4724 }
4725 }
4726
4727 // output the page, maintaining the original physical layout
4728 } else if (physLayout) {
4729
4730 // collect the line fragments for the page and sort them
4731 fragsSize = 256;
4732 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
4733 nFrags = 0;
4734 for (i = 0; i < nBlocks; ++i) {
4735 blk = blocks[i];
4736 for (line = blk->lines; line; line = line->next) {
4737 if (nFrags == fragsSize) {
4738 fragsSize *= 2;
4739 frags = (TextLineFrag *)greallocn(frags,
4740 fragsSize, sizeof(TextLineFrag));
4741 }
4742 frags[nFrags].init(line, 0, line->len);
4743 frags[nFrags].computeCoords(gTrue);
4744 ++nFrags;
4745 }
4746 }
4747 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
4748 i = 0;
4749 while (i < nFrags) {
4750 delta = maxIntraLineDelta * frags[i].line->words->fontSize;
4751 for (j = i+1;
4752 j < nFrags && fabs(frags[j].base - frags[i].base) < delta;
4753 ++j) ;
4754 qsort(frags + i, j - i, sizeof(TextLineFrag),
4755 &TextLineFrag::cmpXYColumnPrimaryRot);
4756 i = j;
4757 }
4758
4759 #if 0 // for debugging
4760 printf("*** line fragments ***\n");
4761 for (i = 0; i < nFrags; ++i) {
4762 frag = &frags[i];
4763 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
4764 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
4765 for (n = 0; n < frag->len; ++n) {
4766 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
4767 }
4768 printf("'\n");
4769 }
4770 printf("\n");
4771 #endif
4772
4773 // generate output
4774 col = 0;
4775 for (i = 0; i < nFrags; ++i) {
4776 frag = &frags[i];
4777
4778 // column alignment
4779 for (; col < frag->col; ++col) {
4780 (*outputFunc)(outputStream, space, spaceLen);
4781 }
4782
4783 // print the line
4784 s = new GooString();
4785 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
4786 (*outputFunc)(outputStream, s->getCString(), s->getLength());
4787 delete s;
4788
4789 // print one or more returns if necessary
4790 if (i == nFrags - 1 ||
4791 frags[i+1].col < col ||
4792 fabs(frags[i+1].base - frag->base) >
4793 maxIntraLineDelta * frag->line->words->fontSize) {
4794 if (i < nFrags - 1) {
4795 d = (int)((frags[i+1].base - frag->base) /
4796 frag->line->words->fontSize);
4797 if (d < 1) {
4798 d = 1;
4799 } else if (d > 5) {
4800 d = 5;
4801 }
4802 } else {
4803 d = 1;
4804 }
4805 for (; d > 0; --d) {
4806 (*outputFunc)(outputStream, eol, eolLen);
4807 }
4808 col = 0;
4809 }
4810 }
4811
4812 gfree(frags);
4813
4814 // output the page, "undoing" the layout
4815 } else {
4816 for (flow = flows; flow; flow = flow->next) {
4817 for (blk = flow->blocks; blk; blk = blk->next) {
4818 for (line = blk->lines; line; line = line->next) {
4819 n = line->len;
4820 if (line->hyphenated && (line->next || blk->next)) {
4821 --n;
4822 }
4823 s = new GooString();
4824 dumpFragment(line->text, n, uMap, s);
4825 (*outputFunc)(outputStream, s->getCString(), s->getLength());
4826 delete s;
4827 // output a newline when a hyphen is not suppressed
4828 if (n == line->len) {
4829 (*outputFunc)(outputStream, eol, eolLen);
4830 }
4831 }
4832 }
4833 (*outputFunc)(outputStream, eol, eolLen);
4834 }
4835 }
4836
4837 // end of page
4838 if (pageBreaks) {
4839 (*outputFunc)(outputStream, eop, eopLen);
4840 }
4841
4842 uMap->decRefCnt();
4843 }
4844
assignColumns(TextLineFrag * frags,int nFrags,GBool oneRot)4845 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
4846 TextLineFrag *frag0, *frag1;
4847 int rot, col1, col2, i, j, k;
4848
4849 // all text in the region has the same rotation -- recompute the
4850 // column numbers based only on the text in the region
4851 if (oneRot) {
4852 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
4853 rot = frags[0].line->rot;
4854 for (i = 0; i < nFrags; ++i) {
4855 frag0 = &frags[i];
4856 col1 = 0;
4857 for (j = 0; j < i; ++j) {
4858 frag1 = &frags[j];
4859 col2 = 0; // make gcc happy
4860 switch (rot) {
4861 case 0:
4862 if (frag0->xMin >= frag1->xMax) {
4863 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4864 frag1->line->col[frag1->start]) + 1;
4865 } else {
4866 for (k = frag1->start;
4867 k < frag1->start + frag1->len &&
4868 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
4869 frag1->line->edge[k+1]);
4870 ++k) ;
4871 col2 = frag1->col +
4872 frag1->line->col[k] - frag1->line->col[frag1->start];
4873 }
4874 break;
4875 case 1:
4876 if (frag0->yMin >= frag1->yMax) {
4877 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4878 frag1->line->col[frag1->start]) + 1;
4879 } else {
4880 for (k = frag1->start;
4881 k < frag1->start + frag1->len &&
4882 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
4883 frag1->line->edge[k+1]);
4884 ++k) ;
4885 col2 = frag1->col +
4886 frag1->line->col[k] - frag1->line->col[frag1->start];
4887 }
4888 break;
4889 case 2:
4890 if (frag0->xMax <= frag1->xMin) {
4891 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4892 frag1->line->col[frag1->start]) + 1;
4893 } else {
4894 for (k = frag1->start;
4895 k < frag1->start + frag1->len &&
4896 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
4897 frag1->line->edge[k+1]);
4898 ++k) ;
4899 col2 = frag1->col +
4900 frag1->line->col[k] - frag1->line->col[frag1->start];
4901 }
4902 break;
4903 case 3:
4904 if (frag0->yMax <= frag1->yMin) {
4905 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
4906 frag1->line->col[frag1->start]) + 1;
4907 } else {
4908 for (k = frag1->start;
4909 k < frag1->start + frag1->len &&
4910 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
4911 frag1->line->edge[k+1]);
4912 ++k) ;
4913 col2 = frag1->col +
4914 frag1->line->col[k] - frag1->line->col[frag1->start];
4915 }
4916 break;
4917 }
4918 if (col2 > col1) {
4919 col1 = col2;
4920 }
4921 }
4922 frag0->col = col1;
4923 }
4924
4925 // the region includes text at different rotations -- use the
4926 // globally assigned column numbers, offset by the minimum column
4927 // number (i.e., shift everything over to column 0)
4928 } else {
4929 col1 = frags[0].col;
4930 for (i = 1; i < nFrags; ++i) {
4931 if (frags[i].col < col1) {
4932 col1 = frags[i].col;
4933 }
4934 }
4935 for (i = 0; i < nFrags; ++i) {
4936 frags[i].col -= col1;
4937 }
4938 }
4939 }
4940
dumpFragment(Unicode * text,int len,UnicodeMap * uMap,GooString * s)4941 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
4942 GooString *s) {
4943 char lre[8], rle[8], popdf[8], buf[8];
4944 int lreLen, rleLen, popdfLen, n;
4945 int nCols, i, j, k;
4946
4947 nCols = 0;
4948
4949 if (uMap->isUnicode()) {
4950
4951 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
4952 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
4953 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
4954
4955 if (primaryLR) {
4956
4957 i = 0;
4958 while (i < len) {
4959 // output a left-to-right section
4960 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
4961 for (k = i; k < j; ++k) {
4962 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4963 s->append(buf, n);
4964 ++nCols;
4965 }
4966 i = j;
4967 // output a right-to-left section
4968 for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
4969 if (j > i) {
4970 s->append(rle, rleLen);
4971 for (k = j - 1; k >= i; --k) {
4972 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4973 s->append(buf, n);
4974 ++nCols;
4975 }
4976 s->append(popdf, popdfLen);
4977 i = j;
4978 }
4979 }
4980
4981 } else {
4982
4983 s->append(rle, rleLen);
4984 i = len - 1;
4985 while (i >= 0) {
4986 // output a right-to-left section
4987 for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
4988 for (k = i; k > j; --k) {
4989 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
4990 s->append(buf, n);
4991 ++nCols;
4992 }
4993 i = j;
4994 // output a left-to-right section
4995 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
4996 if (j < i) {
4997 s->append(lre, lreLen);
4998 for (k = j + 1; k <= i; ++k) {
4999 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
5000 s->append(buf, n);
5001 ++nCols;
5002 }
5003 s->append(popdf, popdfLen);
5004 i = j;
5005 }
5006 }
5007 s->append(popdf, popdfLen);
5008
5009 }
5010
5011 } else {
5012 for (i = 0; i < len; ++i) {
5013 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
5014 s->append(buf, n);
5015 nCols += n;
5016 }
5017 }
5018
5019 return nCols;
5020 }
5021
5022 #if TEXTOUT_WORD_LIST
makeWordList(GBool physLayout)5023 TextWordList *TextPage::makeWordList(GBool physLayout) {
5024 return new TextWordList(this, physLayout);
5025 }
5026 #endif
5027
5028 //------------------------------------------------------------------------
5029 // ActualText
5030 //------------------------------------------------------------------------
ActualText(TextPage * out)5031 ActualText::ActualText(TextPage *out) {
5032 out->incRefCnt();
5033 text = out;
5034 actualText = NULL;
5035 actualTextBMCLevel = 0;
5036 }
5037
~ActualText()5038 ActualText::~ActualText() {
5039 if (actualText)
5040 delete actualText;
5041 text->decRefCnt();
5042 }
5043
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)5044 void ActualText::addChar(GfxState *state, double x, double y,
5045 double dx, double dy,
5046 CharCode c, int nBytes, Unicode *u, int uLen) {
5047 if (actualTextBMCLevel == 0) {
5048 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5049 } else {
5050 // Inside ActualText span.
5051 if (newActualTextSpan) {
5052 actualText_x = x;
5053 actualText_y = y;
5054 actualText_dx = dx;
5055 actualText_dy = dy;
5056 newActualTextSpan = gFalse;
5057 } else {
5058 if (x < actualText_x)
5059 actualText_x = x;
5060 if (y < actualText_y)
5061 actualText_y = y;
5062 if (x + dx > actualText_x + actualText_dx)
5063 actualText_dx = x + dx - actualText_x;
5064 if (y + dy > actualText_y + actualText_dy)
5065 actualText_dy = y + dy - actualText_y;
5066 }
5067 }
5068 }
5069
beginMC(Dict * properties)5070 void ActualText::beginMC(Dict *properties) {
5071 if (actualTextBMCLevel > 0) {
5072 // Already inside a ActualText span.
5073 actualTextBMCLevel++;
5074 return;
5075 }
5076
5077 Object obj;
5078 if (properties && properties->lookup("ActualText", &obj)) {
5079 if (obj.isString()) {
5080 actualText = obj.getString();
5081 actualTextBMCLevel = 1;
5082 newActualTextSpan = gTrue;
5083 }
5084 }
5085 }
5086
endMC(GfxState * state)5087 void ActualText::endMC(GfxState *state) {
5088 char *uniString = NULL;
5089 Unicode *uni;
5090 int length, i;
5091
5092 if (actualTextBMCLevel > 0) {
5093 actualTextBMCLevel--;
5094 if (actualTextBMCLevel == 0) {
5095 // ActualText span closed. Output the span text and the
5096 // extents of all the glyphs inside the span
5097
5098 if (newActualTextSpan) {
5099 // No content inside span.
5100 actualText_x = state->getCurX();
5101 actualText_y = state->getCurY();
5102 actualText_dx = 0;
5103 actualText_dy = 0;
5104 }
5105
5106 if (!actualText->hasUnicodeMarker()) {
5107 if (actualText->getLength() > 0) {
5108 //non-unicode string -- assume pdfDocEncoding and
5109 //try to convert to UTF16BE
5110 uniString = pdfDocEncodingToUTF16(actualText, &length);
5111 } else {
5112 length = 0;
5113 }
5114 } else {
5115 uniString = actualText->getCString();
5116 length = actualText->getLength();
5117 }
5118
5119 if (length < 3)
5120 length = 0;
5121 else
5122 length = length/2 - 1;
5123 uni = new Unicode[length];
5124 for (i = 0 ; i < length; i++)
5125 uni[i] = ((uniString[2 + i*2] & 0xff)<<8)|(uniString[3 + i*2] & 0xff);
5126
5127 text->addChar(state,
5128 actualText_x, actualText_y,
5129 actualText_dx, actualText_dy,
5130 0, 1, uni, length);
5131
5132 delete [] uni;
5133 if (!actualText->hasUnicodeMarker())
5134 delete [] uniString;
5135 delete actualText;
5136 actualText = NULL;
5137 }
5138 }
5139 }
5140
5141 //------------------------------------------------------------------------
5142 // TextOutputDev
5143 //------------------------------------------------------------------------
5144
TextOutputDev_outputToFile(void * stream,char * text,int len)5145 static void TextOutputDev_outputToFile(void *stream, char *text, int len) {
5146 fwrite(text, 1, len, (FILE *)stream);
5147 }
5148
TextOutputDev(char * fileName,GBool physLayoutA,GBool rawOrderA,GBool append)5149 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
5150 GBool rawOrderA, GBool append) {
5151 text = NULL;
5152 physLayout = physLayoutA;
5153 rawOrder = rawOrderA;
5154 doHTML = gFalse;
5155 ok = gTrue;
5156
5157 // open file
5158 needClose = gFalse;
5159 if (fileName) {
5160 if (!strcmp(fileName, "-")) {
5161 outputStream = stdout;
5162 #ifdef _WIN32
5163 // keep DOS from munging the end-of-line characters
5164 setmode(fileno(stdout), O_BINARY);
5165 #endif
5166 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
5167 needClose = gTrue;
5168 } else {
5169 error(-1, "Couldn't open text file '%s'", fileName);
5170 ok = gFalse;
5171 actualText = NULL;
5172 return;
5173 }
5174 outputFunc = &TextOutputDev_outputToFile;
5175 } else {
5176 outputStream = NULL;
5177 }
5178
5179 // set up text object
5180 text = new TextPage(rawOrderA);
5181 actualText = new ActualText(text);
5182 }
5183
TextOutputDev(TextOutputFunc func,void * stream,GBool physLayoutA,GBool rawOrderA)5184 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
5185 GBool physLayoutA, GBool rawOrderA) {
5186 outputFunc = func;
5187 outputStream = stream;
5188 needClose = gFalse;
5189 physLayout = physLayoutA;
5190 rawOrder = rawOrderA;
5191 doHTML = gFalse;
5192 text = new TextPage(rawOrderA);
5193 actualText = new ActualText(text);
5194 ok = gTrue;
5195 }
5196
~TextOutputDev()5197 TextOutputDev::~TextOutputDev() {
5198 if (needClose) {
5199 #ifdef MACOS
5200 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
5201 #endif
5202 fclose((FILE *)outputStream);
5203 }
5204 if (text) {
5205 text->decRefCnt();
5206 }
5207 delete actualText;
5208 }
5209
startPage(int pageNum,GfxState * state)5210 void TextOutputDev::startPage(int pageNum, GfxState *state) {
5211 text->startPage(state);
5212 }
5213
endPage()5214 void TextOutputDev::endPage() {
5215 text->endPage();
5216 text->coalesce(physLayout, doHTML);
5217 if (outputStream) {
5218 text->dump(outputStream, outputFunc, physLayout);
5219 }
5220 }
5221
updateFont(GfxState * state)5222 void TextOutputDev::updateFont(GfxState *state) {
5223 text->updateFont(state);
5224 }
5225
beginString(GfxState * state,GooString * s)5226 void TextOutputDev::beginString(GfxState *state, GooString *s) {
5227 }
5228
endString(GfxState * state)5229 void TextOutputDev::endString(GfxState *state) {
5230 }
5231
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,Unicode * u,int uLen)5232 void TextOutputDev::drawChar(GfxState *state, double x, double y,
5233 double dx, double dy,
5234 double originX, double originY,
5235 CharCode c, int nBytes, Unicode *u, int uLen) {
5236 actualText->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
5237 }
5238
beginMarkedContent(char * name,Dict * properties)5239 void TextOutputDev::beginMarkedContent(char *name, Dict *properties)
5240 {
5241 actualText->beginMC(properties);
5242 }
5243
endMarkedContent(GfxState * state)5244 void TextOutputDev::endMarkedContent(GfxState *state)
5245 {
5246 actualText->endMC(state);
5247 }
5248
stroke(GfxState * state)5249 void TextOutputDev::stroke(GfxState *state) {
5250 GfxPath *path;
5251 GfxSubpath *subpath;
5252 double x[2], y[2];
5253
5254 if (!doHTML) {
5255 return;
5256 }
5257 path = state->getPath();
5258 if (path->getNumSubpaths() != 1) {
5259 return;
5260 }
5261 subpath = path->getSubpath(0);
5262 if (subpath->getNumPoints() != 2) {
5263 return;
5264 }
5265 state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
5266 state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
5267
5268 // look for a vertical or horizontal line
5269 if (x[0] == x[1] || y[0] == y[1]) {
5270 text->addUnderline(x[0], y[0], x[1], y[1]);
5271 }
5272 }
5273
fill(GfxState * state)5274 void TextOutputDev::fill(GfxState *state) {
5275 GfxPath *path;
5276 GfxSubpath *subpath;
5277 double x[5], y[5];
5278 double rx0, ry0, rx1, ry1, t;
5279 int i;
5280
5281 if (!doHTML) {
5282 return;
5283 }
5284 path = state->getPath();
5285 if (path->getNumSubpaths() != 1) {
5286 return;
5287 }
5288 subpath = path->getSubpath(0);
5289 if (subpath->getNumPoints() != 5) {
5290 return;
5291 }
5292 for (i = 0; i < 5; ++i) {
5293 if (subpath->getCurve(i)) {
5294 return;
5295 }
5296 state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
5297 }
5298
5299 // look for a rectangle
5300 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
5301 x[0] == x[4] && y[0] == y[4]) {
5302 rx0 = x[0];
5303 ry0 = y[0];
5304 rx1 = x[2];
5305 ry1 = y[1];
5306 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
5307 x[0] == x[4] && y[0] == y[4]) {
5308 rx0 = x[0];
5309 ry0 = y[0];
5310 rx1 = x[1];
5311 ry1 = y[2];
5312 } else {
5313 return;
5314 }
5315 if (rx1 < rx0) {
5316 t = rx0;
5317 rx0 = rx1;
5318 rx1 = t;
5319 }
5320 if (ry1 < ry0) {
5321 t = ry0;
5322 ry0 = ry1;
5323 ry1 = t;
5324 }
5325
5326 // skinny horizontal rectangle
5327 if (ry1 - ry0 < rx1 - rx0) {
5328 if (ry1 - ry0 < maxUnderlineWidth) {
5329 ry0 = 0.5 * (ry0 + ry1);
5330 text->addUnderline(rx0, ry0, rx1, ry0);
5331 }
5332
5333 // skinny vertical rectangle
5334 } else {
5335 if (rx1 - rx0 < maxUnderlineWidth) {
5336 rx0 = 0.5 * (rx0 + rx1);
5337 text->addUnderline(rx0, ry0, rx0, ry1);
5338 }
5339 }
5340 }
5341
eoFill(GfxState * state)5342 void TextOutputDev::eoFill(GfxState *state) {
5343 if (!doHTML) {
5344 return;
5345 }
5346 fill(state);
5347 }
5348
processLink(Link * link,Catalog *)5349 void TextOutputDev::processLink(Link *link, Catalog * /*catalog*/) {
5350 double x1, y1, x2, y2;
5351 int xMin, yMin, xMax, yMax, x, y;
5352
5353 if (!doHTML) {
5354 return;
5355 }
5356 link->getRect(&x1, &y1, &x2, &y2);
5357 cvtUserToDev(x1, y1, &x, &y);
5358 xMin = xMax = x;
5359 yMin = yMax = y;
5360 cvtUserToDev(x1, y2, &x, &y);
5361 if (x < xMin) {
5362 xMin = x;
5363 } else if (x > xMax) {
5364 xMax = x;
5365 }
5366 if (y < yMin) {
5367 yMin = y;
5368 } else if (y > yMax) {
5369 yMax = y;
5370 }
5371 cvtUserToDev(x2, y1, &x, &y);
5372 if (x < xMin) {
5373 xMin = x;
5374 } else if (x > xMax) {
5375 xMax = x;
5376 }
5377 if (y < yMin) {
5378 yMin = y;
5379 } else if (y > yMax) {
5380 yMax = y;
5381 }
5382 cvtUserToDev(x2, y2, &x, &y);
5383 if (x < xMin) {
5384 xMin = x;
5385 } else if (x > xMax) {
5386 xMax = x;
5387 }
5388 if (y < yMin) {
5389 yMin = y;
5390 } else if (y > yMax) {
5391 yMax = y;
5392 }
5393 text->addLink(xMin, yMin, xMax, yMax, link);
5394 }
5395
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)5396 GBool TextOutputDev::findText(Unicode *s, int len,
5397 GBool startAtTop, GBool stopAtBottom,
5398 GBool startAtLast, GBool stopAtLast,
5399 GBool caseSensitive, GBool backward,
5400 double *xMin, double *yMin,
5401 double *xMax, double *yMax) {
5402 return text->findText(s, len, startAtTop, stopAtBottom,
5403 startAtLast, stopAtLast, caseSensitive, backward,
5404 xMin, yMin, xMax, yMax);
5405 }
5406
getText(double xMin,double yMin,double xMax,double yMax)5407 GooString *TextOutputDev::getText(double xMin, double yMin,
5408 double xMax, double yMax) {
5409 return text->getText(xMin, yMin, xMax, yMax);
5410 }
5411
drawSelection(OutputDev * out,double scale,int rotation,PDFRectangle * selection,SelectionStyle style,GfxColor * glyph_color,GfxColor * box_color)5412 void TextOutputDev::drawSelection(OutputDev *out,
5413 double scale,
5414 int rotation,
5415 PDFRectangle *selection,
5416 SelectionStyle style,
5417 GfxColor *glyph_color, GfxColor *box_color) {
5418 text->drawSelection(out, scale, rotation, selection, style, glyph_color, box_color);
5419 }
5420
getSelectionRegion(PDFRectangle * selection,SelectionStyle style,double scale)5421 GooList *TextOutputDev::getSelectionRegion(PDFRectangle *selection,
5422 SelectionStyle style,
5423 double scale) {
5424 return text->getSelectionRegion(selection, style, scale);
5425 }
5426
getSelectionText(PDFRectangle * selection,SelectionStyle style)5427 GooString *TextOutputDev::getSelectionText(PDFRectangle *selection,
5428 SelectionStyle style)
5429 {
5430 return text->getSelectionText(selection, style);
5431 }
5432
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)5433 GBool TextOutputDev::findCharRange(int pos, int length,
5434 double *xMin, double *yMin,
5435 double *xMax, double *yMax) {
5436 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
5437 }
5438
5439 #if TEXTOUT_WORD_LIST
makeWordList()5440 TextWordList *TextOutputDev::makeWordList() {
5441 return text->makeWordList(physLayout);
5442 }
5443 #endif
5444
takeText()5445 TextPage *TextOutputDev::takeText() {
5446 TextPage *ret;
5447
5448 ret = text;
5449 text = new TextPage(rawOrder);
5450 return ret;
5451 }
5452