1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2014 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 #include <aconf.h>
10
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <stddef.h>
18 #include <math.h>
19 #include <ctype.h>
20 #ifdef _WIN32
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
23 #endif
24 #include "gmem.h"
25 #include "GString.h"
26 #include "GList.h"
27 #include "config.h"
28 #include "Error.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
32 #include "GfxState.h"
33 #include "Link.h"
34 #include "TextOutputDev.h"
35
36 //------------------------------------------------------------------------
37 // parameters
38 //------------------------------------------------------------------------
39
40 // Size of bins used for horizontal and vertical profiles is
41 // splitPrecisionMul * minFontSize.
42 #define splitPrecisionMul 0.05
43
44 // Minimum allowed split precision.
45 #define minSplitPrecision 0.01
46
47 // yMin and yMax (or xMin and xMax for rot=1,3) are adjusted by this
48 // fraction of the text height, to allow for slightly overlapping
49 // lines (or large ascent/descent values).
50 #define ascentAdjustFactor 0
51 #define descentAdjustFactor 0.35
52
// Gaps larger than (largest gap) - splitGapSlack * avgFontSize are
// considered to be equivalent to the largest gap.
55 #define splitGapSlack 0.2
56
57 // The vertical gap threshold (minimum gap required to split
58 // vertically) depends on the (approximate) number of lines in the
59 // block:
60 // threshold = (max + slope * nLines) * avgFontSize
61 // with a min value of vertGapThresholdMin * avgFontSize.
62 #define vertGapThresholdMin 0.8
63 #define vertGapThresholdMax 3
64 #define vertGapThresholdSlope -0.5
65
66 // Vertical gap threshold for table mode.
67 #define vertGapThresholdTableMin 0.2
68 #define vertGapThresholdTableMax 0.5
69 #define vertGapThresholdTableSlope -0.02
70
71 // A large character has a font size larger than
72 // largeCharThreshold * avgFontSize.
73 #define largeCharThreshold 1.5
74
75 // A block will be split vertically only if the resulting chunk
76 // widths are greater than vertSplitChunkThreshold * avgFontSize.
77 #define vertSplitChunkThreshold 2
78
// Max difference in the primary and secondary coordinates (each as a
// fraction of the font size) allowed for duplicated text (fake
// boldface, drop shadows) which is to be discarded.
82 #define dupMaxPriDelta 0.1
83 #define dupMaxSecDelta 0.2
84
85 // Inter-character spacing that varies by less than this multiple of
86 // font size is assumed to be equivalent.
87 #define uniformSpacing 0.07
88
89 // Typical word spacing, as a fraction of font size. This will be
90 // added to the minimum inter-character spacing, to account for wide
91 // character spacing.
92 #define wordSpacing 0.1
93
94 // Minimum paragraph indent from left margin, as a fraction of font
95 // size.
96 #define minParagraphIndent 0.5
97
98 // If the space between two lines is greater than
99 // paragraphSpacingThreshold * avgLineSpacing, start a new paragraph.
100 #define paragraphSpacingThreshold 1.2
101
102 // If font size changes by at least this much (measured in points)
103 // between lines, start a new paragraph.
104 #define paragraphFontSizeDelta 1
105
106 // Spaces at the start of a line in physical layout mode are this wide
107 // (as a multiple of font size).
108 #define physLayoutSpaceWidth 0.33
109
110 // Table cells (TextColumns) are allowed to overlap by this much
111 // in table layout mode (as a fraction of cell width or height).
112 #define tableCellOverlapSlack 0.05
113
114 // Primary axis delta which will cause a line break in raw mode
115 // (as a fraction of font size).
116 #define rawModeLineDelta 0.5
117
118 // Secondary axis delta which will cause a word break in raw mode
119 // (as a fraction of font size).
120 #define rawModeWordSpacing 0.15
121
122 // Secondary axis overlap which will cause a line break in raw mode
123 // (as a fraction of font size).
124 #define rawModeCharOverlap 0.2
125
126 // Max spacing (as a multiple of font size) allowed between the end of
127 // a line and a clipped character to be included in that line.
128 #define clippedTextMaxWordSpace 0.5
129
130 // Max width of underlines (in points).
131 #define maxUnderlineWidth 3
132
133 // Max horizontal distance between edge of word and start of underline
134 // (as a fraction of font size).
135 #define underlineSlack 0.2
136
137 // Max vertical distance between baseline of word and start of
138 // underline (as a fraction of font size).
139 #define underlineBaselineSlack 0.2
140
141 // Max distance between edge of text and edge of link border (as a
142 // fraction of font size).
143 #define hyperlinkSlack 0.2
144
145 //------------------------------------------------------------------------
146 // TextChar
147 //------------------------------------------------------------------------
148
149 class TextChar {
150 public:
151
152 TextChar(Unicode cA, int charPosA, int charLenA,
153 double xMinA, double yMinA, double xMaxA, double yMaxA,
154 int rotA, GBool clippedA, GBool invisibleA,
155 TextFontInfo *fontA, double fontSizeA,
156 double colorRA, double colorGA, double colorBA);
157
158 static int cmpX(const void *p1, const void *p2);
159 static int cmpY(const void *p1, const void *p2);
160
161 Unicode c;
162 int charPos;
163 int charLen;
164 double xMin, yMin, xMax, yMax;
165 Guchar rot;
166 char clipped;
167 char invisible;
168 TextFontInfo *font;
169 double fontSize;
170 double colorR,
171 colorG,
172 colorB;
173 };
174
TextChar(Unicode cA,int charPosA,int charLenA,double xMinA,double yMinA,double xMaxA,double yMaxA,int rotA,GBool clippedA,GBool invisibleA,TextFontInfo * fontA,double fontSizeA,double colorRA,double colorGA,double colorBA)175 TextChar::TextChar(Unicode cA, int charPosA, int charLenA,
176 double xMinA, double yMinA, double xMaxA, double yMaxA,
177 int rotA, GBool clippedA, GBool invisibleA,
178 TextFontInfo *fontA, double fontSizeA,
179 double colorRA, double colorGA, double colorBA) {
180 double t;
181
182 c = cA;
183 charPos = charPosA;
184 charLen = charLenA;
185 xMin = xMinA;
186 yMin = yMinA;
187 xMax = xMaxA;
188 yMax = yMaxA;
189 // this can happen with vertical writing mode, or with odd values
190 // for the char/word spacing parameters
191 if (xMin > xMax) {
192 t = xMin; xMin = xMax; xMax = t;
193 }
194 if (yMin > yMax) {
195 t = yMin; yMin = yMax; yMax = t;
196 }
197 rot = (Guchar)rotA;
198 clipped = (char)clippedA;
199 invisible = (char)invisibleA;
200 font = fontA;
201 fontSize = fontSizeA;
202 colorR = colorRA;
203 colorG = colorGA;
204 colorB = colorBA;
205 }
206
cmpX(const void * p1,const void * p2)207 int TextChar::cmpX(const void *p1, const void *p2) {
208 const TextChar *ch1 = *(const TextChar **)p1;
209 const TextChar *ch2 = *(const TextChar **)p2;
210
211 if (ch1->xMin < ch2->xMin) {
212 return -1;
213 } else if (ch1->xMin > ch2->xMin) {
214 return 1;
215 } else {
216 return 0;
217 }
218 }
219
cmpY(const void * p1,const void * p2)220 int TextChar::cmpY(const void *p1, const void *p2) {
221 const TextChar *ch1 = *(const TextChar **)p1;
222 const TextChar *ch2 = *(const TextChar **)p2;
223
224 if (ch1->yMin < ch2->yMin) {
225 return -1;
226 } else if (ch1->yMin > ch2->yMin) {
227 return 1;
228 } else {
229 return 0;
230 }
231 }
232
233 //------------------------------------------------------------------------
234 // TextBlock
235 //------------------------------------------------------------------------
236
// Kind of node in the TextBlock tree.
enum TextBlockType {
  blkVertSplit = 0,
  blkHorizSplit = 1,
  blkLeaf = 2
};
242
// Tag attached to a TextBlock; a new block starts out as
// blkTagMulticolumn.
enum TextBlockTag {
  blkTagMulticolumn = 0,
  blkTagColumn = 1,
  blkTagLine = 2
};
248
249 class TextBlock {
250 public:
251
252 TextBlock(TextBlockType typeA, int rotA);
253 ~TextBlock();
254 void addChild(TextBlock *child);
255 void addChild(TextChar *child);
256 void prependChild(TextChar *child);
257 void updateBounds(int childIdx);
258
259 TextBlockType type;
260 TextBlockTag tag;
261 int rot;
262 double xMin, yMin, xMax, yMax;
263 GBool smallSplit; // true for blkVertSplit/blkHorizSplit
264 // where the gap size is small
265 GList *children; // for blkLeaf, children are TextWord;
266 // for others, children are TextBlock
267 };
268
TextBlock(TextBlockType typeA,int rotA)269 TextBlock::TextBlock(TextBlockType typeA, int rotA) {
270 type = typeA;
271 tag = blkTagMulticolumn;
272 rot = rotA;
273 xMin = yMin = xMax = yMax = 0;
274 smallSplit = gFalse;
275 children = new GList();
276 }
277
~TextBlock()278 TextBlock::~TextBlock() {
279 if (type == blkLeaf) {
280 delete children;
281 } else {
282 deleteGList(children, TextBlock);
283 }
284 }
285
addChild(TextBlock * child)286 void TextBlock::addChild(TextBlock *child) {
287 if (children->getLength() == 0) {
288 xMin = child->xMin;
289 yMin = child->yMin;
290 xMax = child->xMax;
291 yMax = child->yMax;
292 } else {
293 if (child->xMin < xMin) {
294 xMin = child->xMin;
295 }
296 if (child->yMin < yMin) {
297 yMin = child->yMin;
298 }
299 if (child->xMax > xMax) {
300 xMax = child->xMax;
301 }
302 if (child->yMax > yMax) {
303 yMax = child->yMax;
304 }
305 }
306 children->append(child);
307 }
308
addChild(TextChar * child)309 void TextBlock::addChild(TextChar *child) {
310 if (children->getLength() == 0) {
311 xMin = child->xMin;
312 yMin = child->yMin;
313 xMax = child->xMax;
314 yMax = child->yMax;
315 } else {
316 if (child->xMin < xMin) {
317 xMin = child->xMin;
318 }
319 if (child->yMin < yMin) {
320 yMin = child->yMin;
321 }
322 if (child->xMax > xMax) {
323 xMax = child->xMax;
324 }
325 if (child->yMax > yMax) {
326 yMax = child->yMax;
327 }
328 }
329 children->append(child);
330 }
331
prependChild(TextChar * child)332 void TextBlock::prependChild(TextChar *child) {
333 if (children->getLength() == 0) {
334 xMin = child->xMin;
335 yMin = child->yMin;
336 xMax = child->xMax;
337 yMax = child->yMax;
338 } else {
339 if (child->xMin < xMin) {
340 xMin = child->xMin;
341 }
342 if (child->yMin < yMin) {
343 yMin = child->yMin;
344 }
345 if (child->xMax > xMax) {
346 xMax = child->xMax;
347 }
348 if (child->yMax > yMax) {
349 yMax = child->yMax;
350 }
351 }
352 children->insert(0, child);
353 }
354
updateBounds(int childIdx)355 void TextBlock::updateBounds(int childIdx) {
356 TextBlock *child;
357
358 child = (TextBlock *)children->get(childIdx);
359 if (child->xMin < xMin) {
360 xMin = child->xMin;
361 }
362 if (child->yMin < yMin) {
363 yMin = child->yMin;
364 }
365 if (child->xMax > xMax) {
366 xMax = child->xMax;
367 }
368 if (child->yMax > yMax) {
369 yMax = child->yMax;
370 }
371 }
372
373 //------------------------------------------------------------------------
374 // TextUnderline
375 //------------------------------------------------------------------------
376
377 class TextUnderline {
378 public:
379
TextUnderline(double x0A,double y0A,double x1A,double y1A)380 TextUnderline(double x0A, double y0A, double x1A, double y1A)
381 { x0 = x0A; y0 = y0A; x1 = x1A; y1 = y1A; horiz = y0 == y1; }
~TextUnderline()382 ~TextUnderline() {}
383
384 double x0, y0, x1, y1;
385 GBool horiz;
386 };
387
388 //------------------------------------------------------------------------
389 // TextLink
390 //------------------------------------------------------------------------
391
392 class TextLink {
393 public:
394
TextLink(double xMinA,double yMinA,double xMaxA,double yMaxA,GString * uriA)395 TextLink(double xMinA, double yMinA, double xMaxA, double yMaxA,
396 GString *uriA)
397 { xMin = xMinA; yMin = yMinA; xMax = xMaxA; yMax = yMaxA; uri = uriA; }
398 ~TextLink();
399
400 double xMin, yMin, xMax, yMax;
401 GString *uri;
402 };
403
~TextLink()404 TextLink::~TextLink() {
405 if (uri) {
406 delete uri;
407 }
408 }
409
410 //------------------------------------------------------------------------
411 // TextOutputControl
412 //------------------------------------------------------------------------
413
TextOutputControl()414 TextOutputControl::TextOutputControl() {
415 mode = textOutReadingOrder;
416 fixedPitch = 0;
417 fixedLineSpacing = 0;
418 html = gFalse;
419 clipText = gFalse;
420 }
421
422
423 //------------------------------------------------------------------------
424 // TextFontInfo
425 //------------------------------------------------------------------------
426
TextFontInfo(GfxState * state)427 TextFontInfo::TextFontInfo(GfxState *state) {
428 GfxFont *gfxFont;
429
430 gfxFont = state->getFont();
431 if (gfxFont) {
432 fontID = *gfxFont->getID();
433 ascent = gfxFont->getAscent();
434 descent = gfxFont->getDescent();
435 // "odd" ascent/descent values cause trouble more often than not
436 // (in theory these could be legitimate values for oddly designed
437 // fonts -- but they are more often due to buggy PDF generators)
438 // (values that are too small are a different issue -- those seem
439 // to be more commonly legitimate)
440 if (ascent > 1) {
441 ascent = 0.75;
442 }
443 if (descent < -0.5) {
444 descent = -0.25;
445 }
446 } else {
447 fontID.num = -1;
448 fontID.gen = -1;
449 ascent = 0.75;
450 descent = -0.25;
451 }
452 fontName = (gfxFont && gfxFont->getName()) ? gfxFont->getName()->copy()
453 : (GString *)NULL;
454 flags = gfxFont ? gfxFont->getFlags() : 0;
455 mWidth = 0;
456 if (gfxFont && !gfxFont->isCIDFont()) {
457 char *name;
458 int code;
459 for (code = 0; code < 256; ++code) {
460 if ((name = ((Gfx8BitFont *)gfxFont)->getCharName(code)) &&
461 name[0] == 'm' && name[1] == '\0') {
462 mWidth = ((Gfx8BitFont *)gfxFont)->getWidth(code);
463 break;
464 }
465 }
466 }
467 }
468
~TextFontInfo()469 TextFontInfo::~TextFontInfo() {
470 if (fontName) {
471 delete fontName;
472 }
473 }
474
matches(GfxState * state)475 GBool TextFontInfo::matches(GfxState *state) {
476 Ref *id;
477
478 if (!state->getFont()) {
479 return gFalse;
480 }
481 id = state->getFont()->getID();
482 return id->num == fontID.num && id->gen == fontID.gen;
483 }
484
485 //------------------------------------------------------------------------
486 // TextWord
487 //------------------------------------------------------------------------
488
489 // Build a TextWord object, using chars[start .. start+len-1].
490 // (If rot >= 2, the chars list is in reverse order.)
TextWord(GList * chars,int start,int lenA,int rotA,GBool spaceAfterA)491 TextWord::TextWord(GList *chars, int start, int lenA,
492 int rotA, GBool spaceAfterA) {
493 TextChar *ch;
494 int i;
495
496 rot = rotA;
497 len = lenA;
498 text = (Unicode *)gmallocn(len, sizeof(Unicode));
499 edge = (double *)gmallocn(len + 1, sizeof(double));
500 charPos = (int *)gmallocn(len + 1, sizeof(int));
501 switch (rot) {
502 case 0:
503 default:
504 ch = (TextChar *)chars->get(start);
505 xMin = ch->xMin;
506 yMin = ch->yMin;
507 yMax = ch->yMax;
508 ch = (TextChar *)chars->get(start + len - 1);
509 xMax = ch->xMax;
510 break;
511 case 1:
512 ch = (TextChar *)chars->get(start);
513 xMin = ch->xMin;
514 xMax = ch->xMax;
515 yMin = ch->yMin;
516 ch = (TextChar *)chars->get(start + len - 1);
517 yMax = ch->yMax;
518 break;
519 case 2:
520 ch = (TextChar *)chars->get(start);
521 xMax = ch->xMax;
522 yMin = ch->yMin;
523 yMax = ch->yMax;
524 ch = (TextChar *)chars->get(start + len - 1);
525 xMin = ch->xMin;
526 break;
527 case 3:
528 ch = (TextChar *)chars->get(start);
529 xMin = ch->xMin;
530 xMax = ch->xMax;
531 yMax = ch->yMax;
532 ch = (TextChar *)chars->get(start + len - 1);
533 yMin = ch->yMin;
534 break;
535 }
536 for (i = 0; i < len; ++i) {
537 ch = (TextChar *)chars->get(rot >= 2 ? start + len - 1 - i : start + i);
538 text[i] = ch->c;
539 charPos[i] = ch->charPos;
540 if (i == len - 1) {
541 charPos[len] = ch->charPos + ch->charLen;
542 }
543 switch (rot) {
544 case 0:
545 default:
546 edge[i] = ch->xMin;
547 if (i == len - 1) {
548 edge[len] = ch->xMax;
549 }
550 break;
551 case 1:
552 edge[i] = ch->yMin;
553 if (i == len - 1) {
554 edge[len] = ch->yMax;
555 }
556 break;
557 case 2:
558 edge[i] = ch->xMax;
559 if (i == len - 1) {
560 edge[len] = ch->xMin;
561 }
562 break;
563 case 3:
564 edge[i] = ch->yMax;
565 if (i == len - 1) {
566 edge[len] = ch->yMin;
567 }
568 break;
569 }
570 }
571 ch = (TextChar *)chars->get(start);
572 font = ch->font;
573 fontSize = ch->fontSize;
574 spaceAfter = spaceAfterA;
575 underlined = gFalse;
576 link = NULL;
577 colorR = ch->colorR;
578 colorG = ch->colorG;
579 colorB = ch->colorB;
580 invisible = ch->invisible;
581 }
582
TextWord(TextWord * word)583 TextWord::TextWord(TextWord *word) {
584 *this = *word;
585 text = (Unicode *)gmallocn(len, sizeof(Unicode));
586 memcpy(text, word->text, len * sizeof(Unicode));
587 edge = (double *)gmallocn(len + 1, sizeof(double));
588 memcpy(edge, word->edge, (len + 1) * sizeof(double));
589 charPos = (int *)gmallocn(len + 1, sizeof(int));
590 memcpy(charPos, word->charPos, (len + 1) * sizeof(int));
591 }
592
~TextWord()593 TextWord::~TextWord() {
594 gfree(text);
595 gfree(edge);
596 gfree(charPos);
597 }
598
599 // This is used to append a clipped character to a word.
appendChar(TextChar * ch)600 void TextWord::appendChar(TextChar *ch) {
601 if (ch->xMin < xMin) {
602 xMin = ch->xMin;
603 }
604 if (ch->xMax > xMax) {
605 xMax = ch->xMax;
606 }
607 if (ch->yMin < yMin) {
608 yMin = ch->yMin;
609 }
610 if (ch->yMax > yMax) {
611 yMax = ch->yMax;
612 }
613 text = (Unicode *)greallocn(text, len + 1, sizeof(Unicode));
614 edge = (double *)greallocn(edge, len + 2, sizeof(double));
615 charPos = (int *)greallocn(charPos, len + 2, sizeof(int));
616 text[len] = ch->c;
617 charPos[len] = ch->charPos;
618 charPos[len+1] = ch->charPos + ch->charLen;
619 switch (rot) {
620 case 0:
621 default:
622 edge[len] = ch->xMin;
623 edge[len+1] = ch->xMax;
624 break;
625 case 1:
626 edge[len] = ch->yMin;
627 edge[len+1] = ch->yMax;
628 break;
629 case 2:
630 edge[len] = ch->xMax;
631 edge[len+1] = ch->xMin;
632 break;
633 case 3:
634 edge[len] = ch->yMax;
635 edge[len+1] = ch->yMin;
636 break;
637 }
638 ++len;
639 }
640
cmpYX(const void * p1,const void * p2)641 int TextWord::cmpYX(const void *p1, const void *p2) {
642 const TextWord *word1 = *(const TextWord **)p1;
643 const TextWord *word2 = *(const TextWord **)p2;
644 double cmp;
645
646 if ((cmp = word1->yMin - word2->yMin) == 0) {
647 cmp = word1->xMin - word2->xMin;
648 }
649 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
650 }
651
cmpCharPos(const void * p1,const void * p2)652 int TextWord::cmpCharPos(const void *p1, const void *p2) {
653 const TextWord *word1 = *(const TextWord **)p1;
654 const TextWord *word2 = *(const TextWord **)p2;
655
656 return word1->charPos[0] - word2->charPos[0];
657 }
658
getText()659 GString *TextWord::getText() {
660 GString *s;
661 UnicodeMap *uMap;
662 char buf[8];
663 int n, i;
664
665 s = new GString();
666 if (!(uMap = globalParams->getTextEncoding())) {
667 return s;
668 }
669 for (i = 0; i < len; ++i) {
670 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
671 s->append(buf, n);
672 }
673 uMap->decRefCnt();
674 return s;
675 }
676
getCharBBox(int charIdx,double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)677 void TextWord::getCharBBox(int charIdx, double *xMinA, double *yMinA,
678 double *xMaxA, double *yMaxA) {
679 if (charIdx < 0 || charIdx >= len) {
680 return;
681 }
682 switch (rot) {
683 case 0:
684 *xMinA = edge[charIdx];
685 *xMaxA = edge[charIdx + 1];
686 *yMinA = yMin;
687 *yMaxA = yMax;
688 break;
689 case 1:
690 *xMinA = xMin;
691 *xMaxA = xMax;
692 *yMinA = edge[charIdx];
693 *yMaxA = edge[charIdx + 1];
694 break;
695 case 2:
696 *xMinA = edge[charIdx + 1];
697 *xMaxA = edge[charIdx];
698 *yMinA = yMin;
699 *yMaxA = yMax;
700 break;
701 case 3:
702 *xMinA = xMin;
703 *xMaxA = xMax;
704 *yMinA = edge[charIdx + 1];
705 *yMaxA = edge[charIdx];
706 break;
707 }
708 }
709
getBaseline()710 double TextWord::getBaseline() {
711 switch (rot) {
712 case 0:
713 default:
714 return yMax + fontSize * font->descent;
715 case 1:
716 return xMin - fontSize * font->descent;
717 case 2:
718 return yMin - fontSize * font->descent;
719 case 3:
720 return xMax + fontSize * font->descent;
721 }
722 }
723
getLinkURI()724 GString *TextWord::getLinkURI() {
725 return link ? link->uri : (GString *)NULL;
726 }
727
728 //------------------------------------------------------------------------
729 // TextLine
730 //------------------------------------------------------------------------
731
TextLine(GList * wordsA,double xMinA,double yMinA,double xMaxA,double yMaxA,double fontSizeA)732 TextLine::TextLine(GList *wordsA, double xMinA, double yMinA,
733 double xMaxA, double yMaxA, double fontSizeA) {
734 TextWord *word;
735 int i, j, k;
736
737 words = wordsA;
738 rot = 0;
739 xMin = xMinA;
740 yMin = yMinA;
741 xMax = xMaxA;
742 yMax = yMaxA;
743 fontSize = fontSizeA;
744 px = 0;
745 pw = 0;
746
747 // build the text
748 len = 0;
749 for (i = 0; i < words->getLength(); ++i) {
750 word = (TextWord *)words->get(i);
751 len += word->len;
752 if (word->spaceAfter) {
753 ++len;
754 }
755 }
756 text = (Unicode *)gmallocn(len, sizeof(Unicode));
757 edge = (double *)gmallocn(len + 1, sizeof(double));
758 j = 0;
759 for (i = 0; i < words->getLength(); ++i) {
760 word = (TextWord *)words->get(i);
761 if (i == 0) {
762 rot = word->rot;
763 }
764 for (k = 0; k < word->len; ++k) {
765 text[j] = word->text[k];
766 edge[j] = word->edge[k];
767 ++j;
768 }
769 edge[j] = word->edge[word->len];
770 if (word->spaceAfter) {
771 text[j] = (Unicode)0x0020;
772 ++j;
773 edge[j] = edge[j - 1];
774 }
775 }
776 //~ need to check for other Unicode chars used as hyphens
777 hyphenated = text[len - 1] == (Unicode)'-';
778 }
779
~TextLine()780 TextLine::~TextLine() {
781 deleteGList(words, TextWord);
782 gfree(text);
783 gfree(edge);
784 }
785
getBaseline()786 double TextLine::getBaseline() {
787 TextWord *word0;
788
789 word0 = (TextWord *)words->get(0);
790 switch (rot) {
791 case 0:
792 default:
793 return yMax + fontSize * word0->font->descent;
794 case 1:
795 return xMin - fontSize * word0->font->descent;
796 case 2:
797 return yMin - fontSize * word0->font->descent;
798 case 3:
799 return xMax + fontSize * word0->font->descent;
800 }
801 }
802
803 //------------------------------------------------------------------------
804 // TextParagraph
805 //------------------------------------------------------------------------
806
TextParagraph(GList * linesA)807 TextParagraph::TextParagraph(GList *linesA) {
808 TextLine *line;
809 int i;
810
811 lines = linesA;
812 xMin = yMin = xMax = yMax = 0;
813 for (i = 0; i < lines->getLength(); ++i) {
814 line = (TextLine *)lines->get(i);
815 if (i == 0 || line->xMin < xMin) {
816 xMin = line->xMin;
817 }
818 if (i == 0 || line->yMin < yMin) {
819 yMin = line->yMin;
820 }
821 if (i == 0 || line->xMax > xMax) {
822 xMax = line->xMax;
823 }
824 if (i == 0 || line->yMax > yMax) {
825 yMax = line->yMax;
826 }
827 }
828 }
829
~TextParagraph()830 TextParagraph::~TextParagraph() {
831 deleteGList(lines, TextLine);
832 }
833
834 //------------------------------------------------------------------------
835 // TextColumn
836 //------------------------------------------------------------------------
837
TextColumn(GList * paragraphsA,double xMinA,double yMinA,double xMaxA,double yMaxA)838 TextColumn::TextColumn(GList *paragraphsA, double xMinA, double yMinA,
839 double xMaxA, double yMaxA) {
840 paragraphs = paragraphsA;
841 xMin = xMinA;
842 yMin = yMinA;
843 xMax = xMaxA;
844 yMax = yMaxA;
845 px = py = 0;
846 pw = ph = 0;
847 }
848
~TextColumn()849 TextColumn::~TextColumn() {
850 deleteGList(paragraphs, TextParagraph);
851 }
852
cmpX(const void * p1,const void * p2)853 int TextColumn::cmpX(const void *p1, const void *p2) {
854 const TextColumn *col1 = *(const TextColumn **)p1;
855 const TextColumn *col2 = *(const TextColumn **)p2;
856
857 if (col1->xMin < col2->xMin) {
858 return -1;
859 } else if (col1->xMin > col2->xMin) {
860 return 1;
861 } else {
862 return 0;
863 }
864 }
865
cmpY(const void * p1,const void * p2)866 int TextColumn::cmpY(const void *p1, const void *p2) {
867 const TextColumn *col1 = *(const TextColumn **)p1;
868 const TextColumn *col2 = *(const TextColumn **)p2;
869
870 if (col1->yMin < col2->yMin) {
871 return -1;
872 } else if (col1->yMin > col2->yMin) {
873 return 1;
874 } else {
875 return 0;
876 }
877 }
878
cmpPX(const void * p1,const void * p2)879 int TextColumn::cmpPX(const void *p1, const void *p2) {
880 const TextColumn *col1 = *(const TextColumn **)p1;
881 const TextColumn *col2 = *(const TextColumn **)p2;
882
883 if (col1->px < col2->px) {
884 return -1;
885 } else if (col1->px > col2->px) {
886 return 1;
887 } else {
888 return 0;
889 }
890 }
891
892 //------------------------------------------------------------------------
893 // TextWordList
894 //------------------------------------------------------------------------
895
TextWordList(GList * wordsA)896 TextWordList::TextWordList(GList *wordsA) {
897 words = wordsA;
898 }
899
~TextWordList()900 TextWordList::~TextWordList() {
901 deleteGList(words, TextWord);
902 }
903
getLength()904 int TextWordList::getLength() {
905 return words->getLength();
906 }
907
get(int idx)908 TextWord *TextWordList::get(int idx) {
909 if (idx < 0 || idx >= words->getLength()) {
910 return NULL;
911 }
912 return (TextWord *)words->get(idx);
913 }
914
915 //------------------------------------------------------------------------
916 // TextPage
917 //------------------------------------------------------------------------
918
TextPage(TextOutputControl * controlA)919 TextPage::TextPage(TextOutputControl *controlA) {
920 control = *controlA;
921 pageWidth = pageHeight = 0;
922 charPos = 0;
923 curFont = NULL;
924 curFontSize = 0;
925 curRot = 0;
926 nTinyChars = 0;
927 actualText = NULL;
928 actualTextLen = 0;
929 actualTextX0 = 0;
930 actualTextY0 = 0;
931 actualTextX1 = 0;
932 actualTextY1 = 0;
933 actualTextNBytes = 0;
934
935 chars = new GList();
936 fonts = new GList();
937
938 underlines = new GList();
939 links = new GList();
940
941 findCols = NULL;
942 findLR = gTrue;
943 lastFindXMin = lastFindYMin = 0;
944 haveLastFind = gFalse;
945 }
946
~TextPage()947 TextPage::~TextPage() {
948 clear();
949 deleteGList(chars, TextChar);
950 deleteGList(fonts, TextFontInfo);
951 deleteGList(underlines, TextUnderline);
952 deleteGList(links, TextLink);
953 if (findCols) {
954 deleteGList(findCols, TextColumn);
955 }
956 }
957
startPage(GfxState * state)958 void TextPage::startPage(GfxState *state) {
959 clear();
960 if (state) {
961 pageWidth = state->getPageWidth();
962 pageHeight = state->getPageHeight();
963 } else {
964 pageWidth = pageHeight = 0;
965 }
966 }
967
clear()968 void TextPage::clear() {
969 pageWidth = pageHeight = 0;
970 charPos = 0;
971 curFont = NULL;
972 curFontSize = 0;
973 curRot = 0;
974 nTinyChars = 0;
975 gfree(actualText);
976 actualText = NULL;
977 actualTextLen = 0;
978 actualTextNBytes = 0;
979 deleteGList(chars, TextChar);
980 chars = new GList();
981 deleteGList(fonts, TextFontInfo);
982 fonts = new GList();
983 deleteGList(underlines, TextUnderline);
984 underlines = new GList();
985 deleteGList(links, TextLink);
986 links = new GList();
987
988 if (findCols) {
989 deleteGList(findCols, TextColumn);
990 findCols = NULL;
991 }
992 findLR = gTrue;
993 lastFindXMin = lastFindYMin = 0;
994 haveLastFind = gFalse;
995 }
996
updateFont(GfxState * state)997 void TextPage::updateFont(GfxState *state) {
998 GfxFont *gfxFont;
999 double *fm;
1000 char *name;
1001 int code, mCode, letterCode, anyCode;
1002 double w;
1003 double m[4], m2[4];
1004 int i;
1005
1006 // get the font info object
1007 curFont = NULL;
1008 for (i = 0; i < fonts->getLength(); ++i) {
1009 curFont = (TextFontInfo *)fonts->get(i);
1010 if (curFont->matches(state)) {
1011 break;
1012 }
1013 curFont = NULL;
1014 }
1015 if (!curFont) {
1016 curFont = new TextFontInfo(state);
1017 fonts->append(curFont);
1018 }
1019
1020 // adjust the font size
1021 gfxFont = state->getFont();
1022 curFontSize = state->getTransformedFontSize();
1023 if (gfxFont && gfxFont->getType() == fontType3) {
1024 // This is a hack which makes it possible to deal with some Type 3
1025 // fonts. The problem is that it's impossible to know what the
1026 // base coordinate system used in the font is without actually
1027 // rendering the font. This code tries to guess by looking at the
1028 // width of the character 'm' (which breaks if the font is a
1029 // subset that doesn't contain 'm').
1030 mCode = letterCode = anyCode = -1;
1031 for (code = 0; code < 256; ++code) {
1032 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1033 if (name && name[0] == 'm' && name[1] == '\0') {
1034 mCode = code;
1035 }
1036 if (letterCode < 0 && name && name[1] == '\0' &&
1037 ((name[0] >= 'A' && name[0] <= 'Z') ||
1038 (name[0] >= 'a' && name[0] <= 'z'))) {
1039 letterCode = code;
1040 }
1041 if (anyCode < 0 && name &&
1042 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1043 anyCode = code;
1044 }
1045 }
1046 if (mCode >= 0 &&
1047 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1048 // 0.6 is a generic average 'm' width -- yes, this is a hack
1049 curFontSize *= w / 0.6;
1050 } else if (letterCode >= 0 &&
1051 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1052 // even more of a hack: 0.5 is a generic letter width
1053 curFontSize *= w / 0.5;
1054 } else if (anyCode >= 0 &&
1055 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1056 // better than nothing: 0.5 is a generic character width
1057 curFontSize *= w / 0.5;
1058 }
1059 fm = gfxFont->getFontMatrix();
1060 if (fm[0] != 0) {
1061 curFontSize *= fabs(fm[3] / fm[0]);
1062 }
1063 }
1064
1065 // compute the rotation
1066 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1067 if (gfxFont && gfxFont->getType() == fontType3) {
1068 fm = gfxFont->getFontMatrix();
1069 m2[0] = fm[0] * m[0] + fm[1] * m[2];
1070 m2[1] = fm[0] * m[1] + fm[1] * m[3];
1071 m2[2] = fm[2] * m[0] + fm[3] * m[2];
1072 m2[3] = fm[2] * m[1] + fm[3] * m[3];
1073 m[0] = m2[0];
1074 m[1] = m2[1];
1075 m[2] = m2[2];
1076 m[3] = m2[3];
1077 }
1078 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1079 curRot = (m[0] > 0 || m[3] < 0) ? 0 : 2;
1080 } else {
1081 curRot = (m[2] > 0) ? 1 : 3;
1082 }
1083 }
1084
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)1085 void TextPage::addChar(GfxState *state, double x, double y,
1086 double dx, double dy,
1087 CharCode c, int nBytes, Unicode *u, int uLen) {
1088 double x1, y1, x2, y2, w1, h1, dx2, dy2, ascent, descent, sp;
1089 double xMin, yMin, xMax, yMax;
1090 double clipXMin, clipYMin, clipXMax, clipYMax;
1091 GfxRGB rgb;
1092 GBool clipped, rtl;
1093 int i, j;
1094
1095 // if we're in an ActualText span, save the position info (the
1096 // ActualText chars will be added by TextPage::endActualText()).
1097 if (actualText) {
1098 if (!actualTextNBytes) {
1099 actualTextX0 = x;
1100 actualTextY0 = y;
1101 }
1102 actualTextX1 = x + dx;
1103 actualTextY1 = y + dy;
1104 actualTextNBytes += nBytes;
1105 return;
1106 }
1107
1108 // subtract char and word spacing from the dx,dy values
1109 sp = state->getCharSpace();
1110 if (c == (CharCode)0x20) {
1111 sp += state->getWordSpace();
1112 }
1113 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1114 dx -= dx2;
1115 dy -= dy2;
1116 state->transformDelta(dx, dy, &w1, &h1);
1117
1118 // throw away chars that aren't inside the page bounds
1119 // (and also do a sanity check on the character size)
1120 state->transform(x, y, &x1, &y1);
1121 if (x1 + w1 < 0 || x1 > pageWidth ||
1122 y1 + h1 < 0 || y1 > pageHeight ||
1123 w1 > pageWidth || h1 > pageHeight) {
1124 charPos += nBytes;
1125 return;
1126 }
1127
1128 // check the tiny chars limit
1129 if (!globalParams->getTextKeepTinyChars() &&
1130 fabs(w1) < 3 && fabs(h1) < 3) {
1131 if (++nTinyChars > 50000) {
1132 charPos += nBytes;
1133 return;
1134 }
1135 }
1136
1137 // skip space characters
1138 if (uLen == 1 && u[0] == (Unicode)0x20) {
1139 charPos += nBytes;
1140 return;
1141 }
1142
1143 // check for clipping
1144 clipped = gFalse;
1145 if (control.clipText) {
1146 state->getClipBBox(&clipXMin, &clipYMin, &clipXMax, &clipYMax);
1147 if (x1 + 0.1 * w1 < clipXMin || x1 + 0.9 * w1 > clipXMax ||
1148 y1 + 0.1 * h1 < clipYMin || y1 + 0.9 * h1 > clipYMax) {
1149 clipped = gTrue;
1150 }
1151 }
1152
1153 // add the characters
1154 if (uLen > 0) {
1155
1156 // handle right-to-left ligatures: if there are multiple Unicode
1157 // characters, and they're all right-to-left, insert them in
1158 // right-to-left order
1159 if (uLen > 1) {
1160 rtl = gTrue;
1161 for (i = 0; i < uLen; ++i) {
1162 if (!unicodeTypeR(u[i])) {
1163 rtl = gFalse;
1164 break;
1165 }
1166 }
1167 } else {
1168 rtl = gFalse;
1169 }
1170
1171 w1 /= uLen;
1172 h1 /= uLen;
1173 ascent = curFont->ascent * curFontSize;
1174 descent = curFont->descent * curFontSize;
1175 for (i = 0; i < uLen; ++i) {
1176 x2 = x1 + i * w1;
1177 y2 = y1 + i * h1;
1178 switch (curRot) {
1179 case 0:
1180 default:
1181 xMin = x2;
1182 xMax = x2 + w1;
1183 yMin = y2 - ascent;
1184 yMax = y2 - descent;
1185 break;
1186 case 1:
1187 xMin = x2 + descent;
1188 xMax = x2 + ascent;
1189 yMin = y2;
1190 yMax = y2 + h1;
1191 break;
1192 case 2:
1193 xMin = x2 + w1;
1194 xMax = x2;
1195 yMin = y2 + descent;
1196 yMax = y2 + ascent;
1197 break;
1198 case 3:
1199 xMin = x2 - ascent;
1200 xMax = x2 - descent;
1201 yMin = y2 + h1;
1202 yMax = y2;
1203 break;
1204 }
1205 if ((state->getRender() & 3) == 1) {
1206 state->getStrokeRGB(&rgb);
1207 } else {
1208 state->getFillRGB(&rgb);
1209 }
1210 if (rtl) {
1211 j = uLen - 1 - i;
1212 } else {
1213 j = i;
1214 }
1215 chars->append(new TextChar(u[j], charPos, nBytes, xMin, yMin, xMax, yMax,
1216 curRot, clipped,
1217 state->getRender() == 3,
1218 curFont, curFontSize,
1219 colToDbl(rgb.r), colToDbl(rgb.g),
1220 colToDbl(rgb.b)));
1221 }
1222 }
1223
1224 charPos += nBytes;
1225 }
1226
incCharCount(int nChars)1227 void TextPage::incCharCount(int nChars) {
1228 charPos += nChars;
1229 }
1230
beginActualText(GfxState * state,Unicode * u,int uLen)1231 void TextPage::beginActualText(GfxState *state, Unicode *u, int uLen) {
1232 if (actualText) {
1233 gfree(actualText);
1234 }
1235 actualText = (Unicode *)gmallocn(uLen, sizeof(Unicode));
1236 memcpy(actualText, u, uLen * sizeof(Unicode));
1237 actualTextLen = uLen;
1238 actualTextNBytes = 0;
1239 }
1240
endActualText(GfxState * state)1241 void TextPage::endActualText(GfxState *state) {
1242 Unicode *u;
1243
1244 u = actualText;
1245 actualText = NULL; // so we can call TextPage::addChar()
1246 if (actualTextNBytes) {
1247 // now that we have the position info for all of the text inside
1248 // the marked content span, we feed the "ActualText" back through
1249 // addChar()
1250 addChar(state, actualTextX0, actualTextY0,
1251 actualTextX1 - actualTextX0, actualTextY1 - actualTextY0,
1252 0, actualTextNBytes, u, actualTextLen);
1253 }
1254 gfree(u);
1255 actualText = NULL;
1256 actualTextLen = 0;
1257 actualTextNBytes = gFalse;
1258 }
1259
addUnderline(double x0,double y0,double x1,double y1)1260 void TextPage::addUnderline(double x0, double y0, double x1, double y1) {
1261 underlines->append(new TextUnderline(x0, y0, x1, y1));
1262 }
1263
addLink(double xMin,double yMin,double xMax,double yMax,Link * link)1264 void TextPage::addLink(double xMin, double yMin, double xMax, double yMax,
1265 Link *link) {
1266 GString *uri;
1267
1268 if (link && link->getAction() && link->getAction()->getKind() == actionURI) {
1269 uri = ((LinkURI *)link->getAction())->getURI()->copy();
1270 links->append(new TextLink(xMin, yMin, xMax, yMax, uri));
1271 }
1272 }
1273
1274 //------------------------------------------------------------------------
1275 // TextPage: output
1276 //------------------------------------------------------------------------
1277
write(void * outputStream,TextOutputFunc outputFunc)1278 void TextPage::write(void *outputStream, TextOutputFunc outputFunc) {
1279 UnicodeMap *uMap;
1280 char space[8], eol[16], eop[8];
1281 int spaceLen, eolLen, eopLen;
1282 GBool pageBreaks;
1283
1284 // get the output encoding
1285 if (!(uMap = globalParams->getTextEncoding())) {
1286 return;
1287 }
1288 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
1289 eolLen = 0; // make gcc happy
1290 switch (globalParams->getTextEOL()) {
1291 case eolUnix:
1292 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
1293 break;
1294 case eolDOS:
1295 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1296 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
1297 break;
1298 case eolMac:
1299 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
1300 break;
1301 }
1302 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
1303 pageBreaks = globalParams->getTextPageBreaks();
1304
1305 switch (control.mode) {
1306 case textOutReadingOrder:
1307 writeReadingOrder(outputStream, outputFunc, uMap, space, spaceLen,
1308 eol, eolLen);
1309 break;
1310 case textOutPhysLayout:
1311 case textOutTableLayout:
1312 writePhysLayout(outputStream, outputFunc, uMap, space, spaceLen,
1313 eol, eolLen);
1314 break;
1315 case textOutLinePrinter:
1316 writeLinePrinter(outputStream, outputFunc, uMap, space, spaceLen,
1317 eol, eolLen);
1318 break;
1319 case textOutRawOrder:
1320 writeRaw(outputStream, outputFunc, uMap, space, spaceLen,
1321 eol, eolLen);
1322 break;
1323 }
1324
1325 // end of page
1326 if (pageBreaks) {
1327 (*outputFunc)(outputStream, eop, eopLen);
1328 }
1329
1330 uMap->decRefCnt();
1331 }
1332
writeReadingOrder(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1333 void TextPage::writeReadingOrder(void *outputStream,
1334 TextOutputFunc outputFunc,
1335 UnicodeMap *uMap,
1336 char *space, int spaceLen,
1337 char *eol, int eolLen) {
1338 TextBlock *tree;
1339 TextColumn *col;
1340 TextParagraph *par;
1341 TextLine *line;
1342 GList *columns;
1343 GBool primaryLR;
1344 GString *s;
1345 int colIdx, parIdx, lineIdx, rot, n;
1346
1347 rot = rotateChars(chars);
1348 primaryLR = checkPrimaryLR(chars);
1349 tree = splitChars(chars);
1350 #if 0 //~debug
1351 dumpTree(tree);
1352 #endif
1353 if (!tree) {
1354 // no text
1355 unrotateChars(chars, rot);
1356 return;
1357 }
1358 columns = buildColumns(tree);
1359 delete tree;
1360 unrotateChars(chars, rot);
1361 if (control.html) {
1362 rotateUnderlinesAndLinks(rot);
1363 generateUnderlinesAndLinks(columns);
1364 }
1365 #if 0 //~debug
1366 dumpColumns(columns);
1367 #endif
1368
1369 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1370 col = (TextColumn *)columns->get(colIdx);
1371 for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
1372 par = (TextParagraph *)col->paragraphs->get(parIdx);
1373 for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
1374 line = (TextLine *)par->lines->get(lineIdx);
1375 n = line->len;
1376 if (line->hyphenated && lineIdx + 1 < par->lines->getLength()) {
1377 --n;
1378 }
1379 s = new GString();
1380 encodeFragment(line->text, n, uMap, primaryLR, s);
1381 if (lineIdx + 1 < par->lines->getLength() && !line->hyphenated) {
1382 s->append(space, spaceLen);
1383 }
1384 (*outputFunc)(outputStream, s->getCString(), s->getLength());
1385 delete s;
1386 }
1387 (*outputFunc)(outputStream, eol, eolLen);
1388 }
1389 (*outputFunc)(outputStream, eol, eolLen);
1390 }
1391
1392 deleteGList(columns, TextColumn);
1393 }
1394
makeColumns()1395 GList *TextPage::makeColumns() {
1396 TextBlock *tree;
1397 GList *columns;
1398
1399 tree = splitChars(chars);
1400 if (!tree) {
1401 // no text
1402 return new GList();
1403 }
1404 columns = buildColumns(tree);
1405 delete tree;
1406 if (control.html) {
1407 generateUnderlinesAndLinks(columns);
1408 }
1409 return columns;
1410 }
1411
1412 // This handles both physical layout and table layout modes.
writePhysLayout(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1413 void TextPage::writePhysLayout(void *outputStream,
1414 TextOutputFunc outputFunc,
1415 UnicodeMap *uMap,
1416 char *space, int spaceLen,
1417 char *eol, int eolLen) {
1418 TextBlock *tree;
1419 GString **out;
1420 int *outLen;
1421 TextColumn *col;
1422 TextParagraph *par;
1423 TextLine *line;
1424 GList *columns;
1425 GBool primaryLR;
1426 int ph, colIdx, parIdx, lineIdx, rot, y, i;
1427
1428 #if 0 //~debug
1429 dumpChars(chars);
1430 #endif
1431 rot = rotateChars(chars);
1432 primaryLR = checkPrimaryLR(chars);
1433 tree = splitChars(chars);
1434 #if 0 //~debug
1435 dumpTree(tree);
1436 #endif
1437 if (!tree) {
1438 // no text
1439 unrotateChars(chars, rot);
1440 return;
1441 }
1442 columns = buildColumns(tree);
1443 delete tree;
1444 unrotateChars(chars, rot);
1445 if (control.html) {
1446 rotateUnderlinesAndLinks(rot);
1447 generateUnderlinesAndLinks(columns);
1448 }
1449 ph = assignPhysLayoutPositions(columns);
1450 #if 0 //~debug
1451 dumpColumns(columns);
1452 #endif
1453
1454 out = (GString **)gmallocn(ph, sizeof(GString *));
1455 outLen = (int *)gmallocn(ph, sizeof(int));
1456 for (i = 0; i < ph; ++i) {
1457 out[i] = NULL;
1458 outLen[i] = 0;
1459 }
1460
1461 columns->sort(&TextColumn::cmpPX);
1462 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
1463 col = (TextColumn *)columns->get(colIdx);
1464 y = col->py;
1465 for (parIdx = 0;
1466 parIdx < col->paragraphs->getLength() && y < ph;
1467 ++parIdx) {
1468 par = (TextParagraph *)col->paragraphs->get(parIdx);
1469 for (lineIdx = 0;
1470 lineIdx < par->lines->getLength() && y < ph;
1471 ++lineIdx) {
1472 line = (TextLine *)par->lines->get(lineIdx);
1473 if (!out[y]) {
1474 out[y] = new GString();
1475 }
1476 while (outLen[y] < col->px + line->px) {
1477 out[y]->append(space, spaceLen);
1478 ++outLen[y];
1479 }
1480 encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
1481 outLen[y] += line->pw;
1482 ++y;
1483 }
1484 if (parIdx + 1 < col->paragraphs->getLength()) {
1485 ++y;
1486 }
1487 }
1488 }
1489
1490 for (i = 0; i < ph; ++i) {
1491 if (out[i]) {
1492 (*outputFunc)(outputStream, out[i]->getCString(), out[i]->getLength());
1493 delete out[i];
1494 }
1495 (*outputFunc)(outputStream, eol, eolLen);
1496 }
1497
1498 gfree(out);
1499 gfree(outLen);
1500
1501 deleteGList(columns, TextColumn);
1502 }
1503
writeLinePrinter(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1504 void TextPage::writeLinePrinter(void *outputStream,
1505 TextOutputFunc outputFunc,
1506 UnicodeMap *uMap,
1507 char *space, int spaceLen,
1508 char *eol, int eolLen) {
1509 TextChar *ch, *ch2;
1510 GList *line;
1511 GString *s;
1512 char buf[8];
1513 double pitch, lineSpacing, delta;
1514 double yMin0, yShift, xMin0, xShift;
1515 double y, x;
1516 int rot, n, i, j, k;
1517
1518 rot = rotateChars(chars);
1519 chars->sort(&TextChar::cmpX);
1520 removeDuplicates(chars, 0);
1521 chars->sort(&TextChar::cmpY);
1522
1523 // get character pitch
1524 if (control.fixedPitch > 0) {
1525 pitch = control.fixedPitch;
1526 } else {
1527 // compute (approximate) character pitch
1528 pitch = pageWidth;
1529 for (i = 0; i < chars->getLength(); ++i) {
1530 ch = (TextChar *)chars->get(i);
1531 for (j = i + 1; j < chars->getLength(); ++j) {
1532 ch2 = (TextChar *)chars->get(j);
1533 if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) <
1534 ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin) &&
1535 ch->yMin + ascentAdjustFactor * (ch->yMax - ch->yMin) <
1536 ch2->yMax - descentAdjustFactor * (ch2->yMax - ch2->yMin)) {
1537 delta = fabs(ch2->xMin - ch->xMin);
1538 if (delta > 0 && delta < pitch) {
1539 pitch = delta;
1540 }
1541 }
1542 }
1543 }
1544 }
1545
1546 // get line spacing
1547 if (control.fixedLineSpacing > 0) {
1548 lineSpacing = control.fixedLineSpacing;
1549 } else {
1550 // compute (approximate) line spacing
1551 lineSpacing = pageHeight;
1552 i = 0;
1553 while (i < chars->getLength()) {
1554 ch = (TextChar *)chars->get(i);
1555 // look for the first char that does not (substantially)
1556 // vertically overlap this one
1557 delta = 0;
1558 for (++i; delta == 0 && i < chars->getLength(); ++i) {
1559 ch2 = (TextChar *)chars->get(i);
1560 if (ch2->yMin + ascentAdjustFactor * (ch2->yMax - ch2->yMin) >
1561 ch->yMax - descentAdjustFactor * (ch->yMax - ch->yMin)) {
1562 delta = ch2->yMin - ch->yMin;
1563 }
1564 }
1565 if (delta > 0 && delta < lineSpacing) {
1566 lineSpacing = delta;
1567 }
1568 }
1569 }
1570
1571 // shift the grid to avoid problems with floating point accuracy --
1572 // for fixed line spacing, this avoids problems with
1573 // dropping/inserting blank lines
1574 if (chars->getLength()) {
1575 yMin0 = ((TextChar *)chars->get(0))->yMin;
1576 yShift = yMin0 - (int)(yMin0 / lineSpacing + 0.5) * lineSpacing
1577 - 0.5 * lineSpacing;
1578 } else {
1579 yShift = 0;
1580 }
1581
1582 // for each line...
1583 i = 0;
1584 j = chars->getLength() - 1;
1585 for (y = yShift; y < pageHeight; y += lineSpacing) {
1586
1587 // get the characters in this line
1588 line = new GList;
1589 while (i < chars->getLength() &&
1590 ((TextChar *)chars->get(i))->yMin < y + lineSpacing) {
1591 line->append(chars->get(i++));
1592 }
1593 line->sort(&TextChar::cmpX);
1594
1595 // shift the grid to avoid problems with floating point accuracy
1596 // -- for fixed char spacing, this avoids problems with
1597 // dropping/inserting spaces
1598 if (line->getLength()) {
1599 xMin0 = ((TextChar *)line->get(0))->xMin;
1600 xShift = xMin0 - (int)(xMin0 / pitch + 0.5) * pitch - 0.5 * pitch;
1601 } else {
1602 xShift = 0;
1603 }
1604
1605 // write the line
1606 s = new GString();
1607 x = xShift;
1608 k = 0;
1609 while (k < line->getLength()) {
1610 ch = (TextChar *)line->get(k);
1611 if (ch->xMin < x + pitch) {
1612 n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
1613 s->append(buf, n);
1614 ++k;
1615 } else {
1616 s->append(space, spaceLen);
1617 n = spaceLen;
1618 }
1619 x += (uMap->isUnicode() ? 1 : n) * pitch;
1620 }
1621 s->append(eol, eolLen);
1622 (*outputFunc)(outputStream, s->getCString(), s->getLength());
1623 delete s;
1624 delete line;
1625 }
1626
1627 unrotateChars(chars, rot);
1628 }
1629
writeRaw(void * outputStream,TextOutputFunc outputFunc,UnicodeMap * uMap,char * space,int spaceLen,char * eol,int eolLen)1630 void TextPage::writeRaw(void *outputStream,
1631 TextOutputFunc outputFunc,
1632 UnicodeMap *uMap,
1633 char *space, int spaceLen,
1634 char *eol, int eolLen) {
1635 TextChar *ch, *ch2;
1636 GString *s;
1637 char buf[8];
1638 int n, i;
1639
1640 s = new GString();
1641
1642 for (i = 0; i < chars->getLength(); ++i) {
1643
1644 // process one char
1645 ch = (TextChar *)chars->get(i);
1646 n = uMap->mapUnicode(ch->c, buf, sizeof(buf));
1647 s->append(buf, n);
1648
1649 // check for space or eol
1650 if (i+1 < chars->getLength()) {
1651 ch2 = (TextChar *)chars->get(i+1);
1652 if (ch2->rot != ch->rot) {
1653 s->append(eol, eolLen);
1654 } else {
1655 switch (ch->rot) {
1656 case 0:
1657 default:
1658 if (fabs(ch2->yMin - ch->yMin) > rawModeLineDelta * ch->fontSize ||
1659 ch2->xMin - ch->xMax < -rawModeCharOverlap * ch->fontSize) {
1660 s->append(eol, eolLen);
1661 } else if (ch2->xMin - ch->xMax >
1662 rawModeWordSpacing * ch->fontSize) {
1663 s->append(space, spaceLen);
1664 }
1665 break;
1666 case 1:
1667 if (fabs(ch->xMax - ch2->xMax) > rawModeLineDelta * ch->fontSize ||
1668 ch2->yMin - ch->yMax < -rawModeCharOverlap * ch->fontSize) {
1669 s->append(eol, eolLen);
1670 } else if (ch2->yMin - ch->yMax >
1671 rawModeWordSpacing * ch->fontSize) {
1672 s->append(space, spaceLen);
1673 }
1674 break;
1675 case 2:
1676 if (fabs(ch->yMax - ch2->yMax) > rawModeLineDelta * ch->fontSize ||
1677 ch->xMin - ch2->xMax < -rawModeCharOverlap * ch->fontSize) {
1678 s->append(eol, eolLen);
1679 } else if (ch->xMin - ch2->xMax >
1680 rawModeWordSpacing * ch->fontSize) {
1681 s->append(space, spaceLen);
1682 }
1683 break;
1684 case 3:
1685 if (fabs(ch2->xMin - ch->xMin) > rawModeLineDelta * ch->fontSize ||
1686 ch->yMin - ch2->yMax < -rawModeCharOverlap * ch->fontSize) {
1687 s->append(eol, eolLen);
1688 } else if (ch->yMin - ch2->yMax >
1689 rawModeWordSpacing * ch->fontSize) {
1690 s->append(space, spaceLen);
1691 }
1692 break;
1693 }
1694 }
1695 } else {
1696 s->append(eol, eolLen);
1697 }
1698
1699 if (s->getLength() > 1000) {
1700 (*outputFunc)(outputStream, s->getCString(), s->getLength());
1701 s->clear();
1702 }
1703 }
1704
1705 if (s->getLength() > 0) {
1706 (*outputFunc)(outputStream, s->getCString(), s->getLength());
1707 }
1708 delete s;
1709 }
1710
encodeFragment(Unicode * text,int len,UnicodeMap * uMap,GBool primaryLR,GString * s)1711 void TextPage::encodeFragment(Unicode *text, int len, UnicodeMap *uMap,
1712 GBool primaryLR, GString *s) {
1713 char lre[8], rle[8], popdf[8], buf[8];
1714 int lreLen, rleLen, popdfLen, n;
1715 int i, j, k;
1716
1717 if (uMap->isUnicode()) {
1718
1719 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
1720 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
1721 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
1722
1723 if (primaryLR) {
1724
1725 i = 0;
1726 while (i < len) {
1727 // output a left-to-right section
1728 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
1729 for (k = i; k < j; ++k) {
1730 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1731 s->append(buf, n);
1732 }
1733 i = j;
1734 // output a right-to-left section
1735 for (j = i;
1736 j < len && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
1737 ++j) ;
1738 if (j > i) {
1739 s->append(rle, rleLen);
1740 for (k = j - 1; k >= i; --k) {
1741 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1742 s->append(buf, n);
1743 }
1744 s->append(popdf, popdfLen);
1745 i = j;
1746 }
1747 }
1748
1749 } else {
1750
1751 // Note: This code treats numeric characters (European and
1752 // Arabic/Indic) as left-to-right, which isn't strictly correct
1753 // (incurs extra LRE/POPDF pairs), but does produce correct
1754 // visual formatting.
1755 s->append(rle, rleLen);
1756 i = len - 1;
1757 while (i >= 0) {
1758 // output a right-to-left section
1759 for (j = i;
1760 j >= 0 && !(unicodeTypeL(text[j]) || unicodeTypeNum(text[j]));
1761 --j) ;
1762 for (k = i; k > j; --k) {
1763 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1764 s->append(buf, n);
1765 }
1766 i = j;
1767 // output a left-to-right section
1768 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
1769 if (j < i) {
1770 s->append(lre, lreLen);
1771 for (k = j + 1; k <= i; ++k) {
1772 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
1773 s->append(buf, n);
1774 }
1775 s->append(popdf, popdfLen);
1776 i = j;
1777 }
1778 }
1779 s->append(popdf, popdfLen);
1780 }
1781
1782 } else {
1783 for (i = 0; i < len; ++i) {
1784 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
1785 s->append(buf, n);
1786 }
1787 }
1788 }
1789
1790 //------------------------------------------------------------------------
1791 // TextPage: layout analysis
1792 //------------------------------------------------------------------------
1793
1794 // Determine primary (most common) rotation value. Rotate all chars
1795 // to that primary rotation.
rotateChars(GList * charsA)1796 int TextPage::rotateChars(GList *charsA) {
1797 TextChar *ch;
1798 int nChars[4];
1799 double xMin, yMin, xMax, yMax, t;
1800 int rot, i;
1801
1802 // determine primary rotation
1803 nChars[0] = nChars[1] = nChars[2] = nChars[3] = 0;
1804 for (i = 0; i < charsA->getLength(); ++i) {
1805 ch = (TextChar *)charsA->get(i);
1806 ++nChars[ch->rot];
1807 }
1808 rot = 0;
1809 for (i = 1; i < 4; ++i) {
1810 if (nChars[i] > nChars[rot]) {
1811 rot = i;
1812 }
1813 }
1814
1815 // rotate
1816 switch (rot) {
1817 case 0:
1818 default:
1819 break;
1820 case 1:
1821 for (i = 0; i < charsA->getLength(); ++i) {
1822 ch = (TextChar *)charsA->get(i);
1823 xMin = ch->yMin;
1824 xMax = ch->yMax;
1825 yMin = pageWidth - ch->xMax;
1826 yMax = pageWidth - ch->xMin;
1827 ch->xMin = xMin;
1828 ch->xMax = xMax;
1829 ch->yMin = yMin;
1830 ch->yMax = yMax;
1831 ch->rot = (ch->rot + 3) & 3;
1832 }
1833 t = pageWidth;
1834 pageWidth = pageHeight;
1835 pageHeight = t;
1836 break;
1837 case 2:
1838 for (i = 0; i < charsA->getLength(); ++i) {
1839 ch = (TextChar *)charsA->get(i);
1840 xMin = pageWidth - ch->xMax;
1841 xMax = pageWidth - ch->xMin;
1842 yMin = pageHeight - ch->yMax;
1843 yMax = pageHeight - ch->yMin;
1844 ch->xMin = xMin;
1845 ch->xMax = xMax;
1846 ch->yMin = yMin;
1847 ch->yMax = yMax;
1848 ch->rot = (ch->rot + 2) & 3;
1849 }
1850 break;
1851 case 3:
1852 for (i = 0; i < charsA->getLength(); ++i) {
1853 ch = (TextChar *)charsA->get(i);
1854 xMin = pageHeight - ch->yMax;
1855 xMax = pageHeight - ch->yMin;
1856 yMin = ch->xMin;
1857 yMax = ch->xMax;
1858 ch->xMin = xMin;
1859 ch->xMax = xMax;
1860 ch->yMin = yMin;
1861 ch->yMax = yMax;
1862 ch->rot = (ch->rot + 1) & 3;
1863 }
1864 t = pageWidth;
1865 pageWidth = pageHeight;
1866 pageHeight = t;
1867 break;
1868 }
1869
1870 return rot;
1871 }
1872
1873 // Rotate the TextUnderlines and TextLinks to match the transform
1874 // performed by rotateChars().
rotateUnderlinesAndLinks(int rot)1875 void TextPage::rotateUnderlinesAndLinks(int rot) {
1876 TextUnderline *underline;
1877 TextLink *link;
1878 double xMin, yMin, xMax, yMax;
1879 int i;
1880
1881 switch (rot) {
1882 case 0:
1883 default:
1884 break;
1885 case 1:
1886 for (i = 0; i < underlines->getLength(); ++i) {
1887 underline = (TextUnderline *)underlines->get(i);
1888 xMin = underline->y0;
1889 xMax = underline->y1;
1890 yMin = pageWidth - underline->x1;
1891 yMax = pageWidth - underline->x0;
1892 underline->x0 = xMin;
1893 underline->x1 = xMax;
1894 underline->y0 = yMin;
1895 underline->y1 = yMax;
1896 underline->horiz = !underline->horiz;
1897 }
1898 for (i = 0; i < links->getLength(); ++i) {
1899 link = (TextLink *)links->get(i);
1900 xMin = link->yMin;
1901 xMax = link->yMax;
1902 yMin = pageWidth - link->xMax;
1903 yMax = pageWidth - link->xMin;
1904 link->xMin = xMin;
1905 link->xMax = xMax;
1906 link->yMin = yMin;
1907 link->yMax = yMax;
1908 }
1909 break;
1910 case 2:
1911 for (i = 0; i < underlines->getLength(); ++i) {
1912 underline = (TextUnderline *)underlines->get(i);
1913 xMin = pageWidth - underline->x1;
1914 xMax = pageWidth - underline->x0;
1915 yMin = pageHeight - underline->y1;
1916 yMax = pageHeight - underline->y0;
1917 underline->x0 = xMin;
1918 underline->x1 = xMax;
1919 underline->y0 = yMin;
1920 underline->y1 = yMax;
1921 }
1922 for (i = 0; i < links->getLength(); ++i) {
1923 link = (TextLink *)links->get(i);
1924 xMin = pageWidth - link->xMax;
1925 xMax = pageWidth - link->xMin;
1926 yMin = pageHeight - link->yMax;
1927 yMax = pageHeight - link->yMin;
1928 link->xMin = xMin;
1929 link->xMax = xMax;
1930 link->yMin = yMin;
1931 link->yMax = yMax;
1932 }
1933 break;
1934 case 3:
1935 for (i = 0; i < underlines->getLength(); ++i) {
1936 underline = (TextUnderline *)underlines->get(i);
1937 xMin = pageHeight - underline->y1;
1938 xMax = pageHeight - underline->y0;
1939 yMin = underline->x0;
1940 yMax = underline->x1;
1941 underline->x0 = xMin;
1942 underline->x1 = xMax;
1943 underline->y0 = yMin;
1944 underline->y1 = yMax;
1945 underline->horiz = !underline->horiz;
1946 }
1947 for (i = 0; i < links->getLength(); ++i) {
1948 link = (TextLink *)links->get(i);
1949 xMin = pageHeight - link->yMax;
1950 xMax = pageHeight - link->yMin;
1951 yMin = link->xMin;
1952 yMax = link->xMax;
1953 link->xMin = xMin;
1954 link->xMax = xMax;
1955 link->yMin = yMin;
1956 link->yMax = yMax;
1957 }
1958 break;
1959 }
1960 }
1961
1962 // Undo the coordinate transform performed by rotateChars().
unrotateChars(GList * charsA,int rot)1963 void TextPage::unrotateChars(GList *charsA, int rot) {
1964 TextChar *ch;
1965 double xMin, yMin, xMax, yMax, t;
1966 int i;
1967
1968 switch (rot) {
1969 case 0:
1970 default:
1971 // no transform
1972 break;
1973 case 1:
1974 t = pageWidth;
1975 pageWidth = pageHeight;
1976 pageHeight = t;
1977 for (i = 0; i < charsA->getLength(); ++i) {
1978 ch = (TextChar *)charsA->get(i);
1979 xMin = pageWidth - ch->yMax;
1980 xMax = pageWidth - ch->yMin;
1981 yMin = ch->xMin;
1982 yMax = ch->xMax;
1983 ch->xMin = xMin;
1984 ch->xMax = xMax;
1985 ch->yMin = yMin;
1986 ch->yMax = yMax;
1987 ch->rot = (ch->rot + 1) & 3;
1988 }
1989 break;
1990 case 2:
1991 for (i = 0; i < charsA->getLength(); ++i) {
1992 ch = (TextChar *)charsA->get(i);
1993 xMin = pageWidth - ch->xMax;
1994 xMax = pageWidth - ch->xMin;
1995 yMin = pageHeight - ch->yMax;
1996 yMax = pageHeight - ch->yMin;
1997 ch->xMin = xMin;
1998 ch->xMax = xMax;
1999 ch->yMin = yMin;
2000 ch->yMax = yMax;
2001 ch->rot = (ch->rot + 2) & 3;
2002 }
2003 break;
2004 case 3:
2005 t = pageWidth;
2006 pageWidth = pageHeight;
2007 pageHeight = t;
2008 for (i = 0; i < charsA->getLength(); ++i) {
2009 ch = (TextChar *)charsA->get(i);
2010 xMin = ch->yMin;
2011 xMax = ch->yMax;
2012 yMin = pageHeight - ch->xMax;
2013 yMax = pageHeight - ch->xMin;
2014 ch->xMin = xMin;
2015 ch->xMax = xMax;
2016 ch->yMin = yMin;
2017 ch->yMax = yMax;
2018 ch->rot = (ch->rot + 3) & 3;
2019 }
2020 break;
2021 }
2022 }
2023
2024 // Undo the coordinate transform performed by rotateChars().
unrotateColumns(GList * columns,int rot)2025 void TextPage::unrotateColumns(GList *columns, int rot) {
2026 TextColumn *col;
2027 TextParagraph *par;
2028 TextLine *line;
2029 TextWord *word;
2030 double xMin, yMin, xMax, yMax, t;
2031 int colIdx, parIdx, lineIdx, wordIdx, i;
2032
2033 switch (rot) {
2034 case 0:
2035 default:
2036 // no transform
2037 break;
2038 case 1:
2039 t = pageWidth;
2040 pageWidth = pageHeight;
2041 pageHeight = t;
2042 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2043 col = (TextColumn *)columns->get(colIdx);
2044 xMin = pageWidth - col->yMax;
2045 xMax = pageWidth - col->yMin;
2046 yMin = col->xMin;
2047 yMax = col->xMax;
2048 col->xMin = xMin;
2049 col->xMax = xMax;
2050 col->yMin = yMin;
2051 col->yMax = yMax;
2052 for (parIdx = 0;
2053 parIdx < col->paragraphs->getLength();
2054 ++parIdx) {
2055 par = (TextParagraph *)col->paragraphs->get(parIdx);
2056 xMin = pageWidth - par->yMax;
2057 xMax = pageWidth - par->yMin;
2058 yMin = par->xMin;
2059 yMax = par->xMax;
2060 par->xMin = xMin;
2061 par->xMax = xMax;
2062 par->yMin = yMin;
2063 par->yMax = yMax;
2064 for (lineIdx = 0;
2065 lineIdx < par->lines->getLength();
2066 ++lineIdx) {
2067 line = (TextLine *)par->lines->get(lineIdx);
2068 xMin = pageWidth - line->yMax;
2069 xMax = pageWidth - line->yMin;
2070 yMin = line->xMin;
2071 yMax = line->xMax;
2072 line->xMin = xMin;
2073 line->xMax = xMax;
2074 line->yMin = yMin;
2075 line->yMax = yMax;
2076 line->rot = (line->rot + 1) & 3;
2077 for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2078 word = (TextWord *)line->words->get(wordIdx);
2079 xMin = pageWidth - word->yMax;
2080 xMax = pageWidth - word->yMin;
2081 yMin = word->xMin;
2082 yMax = word->xMax;
2083 word->xMin = xMin;
2084 word->xMax = xMax;
2085 word->yMin = yMin;
2086 word->yMax = yMax;
2087 word->rot = (word->rot + 1) & 3;
2088 }
2089 }
2090 }
2091 }
2092 break;
2093 case 2:
2094 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2095 col = (TextColumn *)columns->get(colIdx);
2096 xMin = pageWidth - col->xMax;
2097 xMax = pageWidth - col->xMin;
2098 yMin = pageHeight - col->yMax;
2099 yMax = pageHeight - col->yMin;
2100 col->xMin = xMin;
2101 col->xMax = xMax;
2102 col->yMin = yMin;
2103 col->yMax = yMax;
2104 for (parIdx = 0;
2105 parIdx < col->paragraphs->getLength();
2106 ++parIdx) {
2107 par = (TextParagraph *)col->paragraphs->get(parIdx);
2108 xMin = pageWidth - par->xMax;
2109 xMax = pageWidth - par->xMin;
2110 yMin = pageHeight - par->yMax;
2111 yMax = pageHeight - par->yMin;
2112 par->xMin = xMin;
2113 par->xMax = xMax;
2114 par->yMin = yMin;
2115 par->yMax = yMax;
2116 for (lineIdx = 0;
2117 lineIdx < par->lines->getLength();
2118 ++lineIdx) {
2119 line = (TextLine *)par->lines->get(lineIdx);
2120 xMin = pageWidth - line->xMax;
2121 xMax = pageWidth - line->xMin;
2122 yMin = pageHeight - line->yMax;
2123 yMax = pageHeight - line->yMin;
2124 line->xMin = xMin;
2125 line->xMax = xMax;
2126 line->yMin = yMin;
2127 line->yMax = yMax;
2128 line->rot = (line->rot + 2) & 3;
2129 for (i = 0; i <= line->len; ++i) {
2130 line->edge[i] = pageWidth - line->edge[i];
2131 }
2132 for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2133 word = (TextWord *)line->words->get(wordIdx);
2134 xMin = pageWidth - word->xMax;
2135 xMax = pageWidth - word->xMin;
2136 yMin = pageHeight - word->yMax;
2137 yMax = pageHeight - word->yMin;
2138 word->xMin = xMin;
2139 word->xMax = xMax;
2140 word->yMin = yMin;
2141 word->yMax = yMax;
2142 word->rot = (word->rot + 2) & 3;
2143 for (i = 0; i <= word->len; ++i) {
2144 word->edge[i] = pageWidth - word->edge[i];
2145 }
2146 }
2147 }
2148 }
2149 }
2150 break;
2151 case 3:
2152 t = pageWidth;
2153 pageWidth = pageHeight;
2154 pageHeight = t;
2155 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
2156 col = (TextColumn *)columns->get(colIdx);
2157 xMin = col->yMin;
2158 xMax = col->yMax;
2159 yMin = pageHeight - col->xMax;
2160 yMax = pageHeight - col->xMin;
2161 col->xMin = xMin;
2162 col->xMax = xMax;
2163 col->yMin = yMin;
2164 col->yMax = yMax;
2165 for (parIdx = 0;
2166 parIdx < col->paragraphs->getLength();
2167 ++parIdx) {
2168 par = (TextParagraph *)col->paragraphs->get(parIdx);
2169 xMin = par->yMin;
2170 xMax = par->yMax;
2171 yMin = pageHeight - par->xMax;
2172 yMax = pageHeight - par->xMin;
2173 par->xMin = xMin;
2174 par->xMax = xMax;
2175 par->yMin = yMin;
2176 par->yMax = yMax;
2177 for (lineIdx = 0;
2178 lineIdx < par->lines->getLength();
2179 ++lineIdx) {
2180 line = (TextLine *)par->lines->get(lineIdx);
2181 xMin = line->yMin;
2182 xMax = line->yMax;
2183 yMin = pageHeight - line->xMax;
2184 yMax = pageHeight - line->xMin;
2185 line->xMin = xMin;
2186 line->xMax = xMax;
2187 line->yMin = yMin;
2188 line->yMax = yMax;
2189 line->rot = (line->rot + 3) & 3;
2190 for (i = 0; i <= line->len; ++i) {
2191 line->edge[i] = pageHeight - line->edge[i];
2192 }
2193 for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
2194 word = (TextWord *)line->words->get(wordIdx);
2195 xMin = word->yMin;
2196 xMax = word->yMax;
2197 yMin = pageHeight - word->xMax;
2198 yMax = pageHeight - word->xMin;
2199 word->xMin = xMin;
2200 word->xMax = xMax;
2201 word->yMin = yMin;
2202 word->yMax = yMax;
2203 word->rot = (word->rot + 3) & 3;
2204 for (i = 0; i <= word->len; ++i) {
2205 word->edge[i] = pageHeight - word->edge[i];
2206 }
2207 }
2208 }
2209 }
2210 }
2211 break;
2212 }
2213 }
2214
unrotateWords(GList * words,int rot)2215 void TextPage::unrotateWords(GList *words, int rot) {
2216 TextWord *word;
2217 double xMin, yMin, xMax, yMax;
2218 int i, j;
2219
2220 switch (rot) {
2221 case 0:
2222 default:
2223 // no transform
2224 break;
2225 case 1:
2226 for (i = 0; i < words->getLength(); ++i) {
2227 word = (TextWord *)words->get(i);
2228 xMin = pageWidth - word->yMax;
2229 xMax = pageWidth - word->yMin;
2230 yMin = word->xMin;
2231 yMax = word->xMax;
2232 word->xMin = xMin;
2233 word->xMax = xMax;
2234 word->yMin = yMin;
2235 word->yMax = yMax;
2236 word->rot = (word->rot + 1) & 3;
2237 }
2238 break;
2239 case 2:
2240 for (i = 0; i < words->getLength(); ++i) {
2241 word = (TextWord *)words->get(i);
2242 xMin = pageWidth - word->xMax;
2243 xMax = pageWidth - word->xMin;
2244 yMin = pageHeight - word->yMax;
2245 yMax = pageHeight - word->yMin;
2246 word->xMin = xMin;
2247 word->xMax = xMax;
2248 word->yMin = yMin;
2249 word->yMax = yMax;
2250 word->rot = (word->rot + 2) & 3;
2251 for (j = 0; j <= word->len; ++j) {
2252 word->edge[j] = pageWidth - word->edge[j];
2253 }
2254 }
2255 break;
2256 case 3:
2257 for (i = 0; i < words->getLength(); ++i) {
2258 word = (TextWord *)words->get(i);
2259 xMin = word->yMin;
2260 xMax = word->yMax;
2261 yMin = pageHeight - word->xMax;
2262 yMax = pageHeight - word->xMin;
2263 word->xMin = xMin;
2264 word->xMax = xMax;
2265 word->yMin = yMin;
2266 word->yMax = yMax;
2267 word->rot = (word->rot + 3) & 3;
2268 for (j = 0; j <= word->len; ++j) {
2269 word->edge[j] = pageHeight - word->edge[j];
2270 }
2271 }
2272 break;
2273 }
2274 }
2275
2276 // Determine the primary text direction (LR vs RL). Returns true for
2277 // LR, false for RL.
checkPrimaryLR(GList * charsA)2278 GBool TextPage::checkPrimaryLR(GList *charsA) {
2279 TextChar *ch;
2280 int i, lrCount;
2281
2282 lrCount = 0;
2283 for (i = 0; i < charsA->getLength(); ++i) {
2284 ch = (TextChar *)charsA->get(i);
2285 if (unicodeTypeL(ch->c)) {
2286 ++lrCount;
2287 } else if (unicodeTypeR(ch->c)) {
2288 --lrCount;
2289 }
2290 }
2291 return lrCount >= 0;
2292 }
2293
2294 // Remove duplicate characters. The list of chars has been sorted --
2295 // by x for rot=0,2; by y for rot=1,3.
removeDuplicates(GList * charsA,int rot)2296 void TextPage::removeDuplicates(GList *charsA, int rot) {
2297 TextChar *ch, *ch2;
2298 double xDelta, yDelta;
2299 int i, j;
2300
2301 if (rot & 1) {
2302 for (i = 0; i < charsA->getLength(); ++i) {
2303 ch = (TextChar *)charsA->get(i);
2304 xDelta = dupMaxSecDelta * ch->fontSize;
2305 yDelta = dupMaxPriDelta * ch->fontSize;
2306 j = i + 1;
2307 while (j < charsA->getLength()) {
2308 ch2 = (TextChar *)charsA->get(j);
2309 if (ch2->yMin - ch->yMin >= yDelta) {
2310 break;
2311 }
2312 if (ch2->c == ch->c &&
2313 fabs(ch2->xMin - ch->xMin) < xDelta &&
2314 fabs(ch2->xMax - ch->xMax) < xDelta &&
2315 fabs(ch2->yMax - ch->yMax) < yDelta) {
2316 charsA->del(j);
2317 } else {
2318 ++j;
2319 }
2320 }
2321 }
2322 } else {
2323 for (i = 0; i < charsA->getLength(); ++i) {
2324 ch = (TextChar *)charsA->get(i);
2325 xDelta = dupMaxPriDelta * ch->fontSize;
2326 yDelta = dupMaxSecDelta * ch->fontSize;
2327 j = i + 1;
2328 while (j < charsA->getLength()) {
2329 ch2 = (TextChar *)charsA->get(j);
2330 if (ch2->xMin - ch->xMin >= xDelta) {
2331 break;
2332 }
2333 if (ch2->c == ch->c &&
2334 fabs(ch2->xMax - ch->xMax) < xDelta &&
2335 fabs(ch2->yMin - ch->yMin) < yDelta &&
2336 fabs(ch2->yMax - ch->yMax) < yDelta) {
2337 charsA->del(j);
2338 } else {
2339 ++j;
2340 }
2341 }
2342 }
2343 }
2344 }
2345
2346 // Split the characters into trees of TextBlocks, one tree for each
2347 // rotation. Merge into a single tree (with the primary rotation).
splitChars(GList * charsA)2348 TextBlock *TextPage::splitChars(GList *charsA) {
2349 TextBlock *tree[4];
2350 TextBlock *blk;
2351 GList *chars2, *clippedChars;
2352 TextChar *ch;
2353 int rot, i;
2354
2355 // split: build a tree of TextBlocks for each rotation
2356 clippedChars = new GList();
2357 for (rot = 0; rot < 4; ++rot) {
2358 chars2 = new GList();
2359 for (i = 0; i < charsA->getLength(); ++i) {
2360 ch = (TextChar *)charsA->get(i);
2361 if (ch->rot == rot) {
2362 chars2->append(ch);
2363 }
2364 }
2365 tree[rot] = NULL;
2366 if (chars2->getLength() > 0) {
2367 chars2->sort((rot & 1) ? &TextChar::cmpY : &TextChar::cmpX);
2368 removeDuplicates(chars2, rot);
2369 if (control.clipText) {
2370 i = 0;
2371 while (i < chars2->getLength()) {
2372 ch = (TextChar *)chars2->get(i);
2373 if (ch->clipped) {
2374 ch = (TextChar *)chars2->del(i);
2375 clippedChars->append(ch);
2376 } else {
2377 ++i;
2378 }
2379 }
2380 }
2381 if (chars2->getLength() > 0) {
2382 tree[rot] = split(chars2, rot);
2383 }
2384 }
2385 delete chars2;
2386 }
2387
2388 // if the page contains no (unclipped) text, just leave an empty
2389 // column list
2390 if (!tree[0]) {
2391 delete clippedChars;
2392 return NULL;
2393 }
2394
2395 // if the main tree is not a multicolumn node, insert one so that
2396 // rotated text has somewhere to go
2397 if (tree[0]->tag != blkTagMulticolumn) {
2398 blk = new TextBlock(blkHorizSplit, 0);
2399 blk->addChild(tree[0]);
2400 blk->tag = blkTagMulticolumn;
2401 tree[0] = blk;
2402 }
2403
2404 // merge non-primary-rotation text into the primary-rotation tree
2405 for (rot = 1; rot < 4; ++rot) {
2406 if (tree[rot]) {
2407 insertIntoTree(tree[rot], tree[0]);
2408 tree[rot] = NULL;
2409 }
2410 }
2411
2412 if (clippedChars->getLength()) {
2413 insertClippedChars(clippedChars, tree[0]);
2414 }
2415 delete clippedChars;
2416
2417 #if 0 //~debug
2418 dumpTree(tree[0]);
2419 #endif
2420
2421 return tree[0];
2422 }
2423
2424 // Generate a tree of TextBlocks, marked as columns, lines, and words.
split(GList * charsA,int rot)2425 TextBlock *TextPage::split(GList *charsA, int rot) {
2426 TextBlock *blk;
2427 GList *chars2, *chars3;
2428 int *horizProfile, *vertProfile;
2429 double xMin, yMin, xMax, yMax;
2430 int xMinI, yMinI, xMaxI, yMaxI;
2431 int xMinI2, yMinI2, xMaxI2, yMaxI2;
2432 TextChar *ch;
2433 double minFontSize, avgFontSize, splitPrecision;
2434 double nLines, vertGapThreshold, ascentAdjust, descentAdjust, minChunk;
2435 int horizGapSize, vertGapSize;
2436 double horizGapSize2, vertGapSize2;
2437 int minHorizChunkWidth, minVertChunkWidth, nHorizGaps, nVertGaps;
2438 double largeCharSize;
2439 int nLargeChars;
2440 GBool doHorizSplit, doVertSplit, smallSplit;
2441 int i, x, y, prev, start;
2442
2443 //----- compute bbox, min font size, average font size, and
2444 // split precision for this block
2445
2446 xMin = yMin = xMax = yMax = 0; // make gcc happy
2447 minFontSize = avgFontSize = 0;
2448 for (i = 0; i < charsA->getLength(); ++i) {
2449 ch = (TextChar *)charsA->get(i);
2450 if (i == 0 || ch->xMin < xMin) {
2451 xMin = ch->xMin;
2452 }
2453 if (i == 0 || ch->yMin < yMin) {
2454 yMin = ch->yMin;
2455 }
2456 if (i == 0 || ch->xMax > xMax) {
2457 xMax = ch->xMax;
2458 }
2459 if (i == 0 || ch->yMax > yMax) {
2460 yMax = ch->yMax;
2461 }
2462 avgFontSize += ch->fontSize;
2463 if (i == 0 || ch->fontSize < minFontSize) {
2464 minFontSize = ch->fontSize;
2465 }
2466 }
2467 avgFontSize /= charsA->getLength();
2468 splitPrecision = splitPrecisionMul * minFontSize;
2469 if (splitPrecision < minSplitPrecision) {
2470 splitPrecision = minSplitPrecision;
2471 }
2472
2473 //----- compute the horizontal and vertical profiles
2474
2475 if (xMin / splitPrecision < 0.5 * INT_MIN ||
2476 xMax / splitPrecision > 0.5 * INT_MAX ||
2477 yMin / splitPrecision < 0.5 * INT_MIN ||
2478 xMax / splitPrecision > 0.5 * INT_MAX) {
2479 return NULL;
2480 }
2481 // add some slack to the array bounds to avoid floating point
2482 // precision problems
2483 xMinI = (int)floor(xMin / splitPrecision) - 1;
2484 yMinI = (int)floor(yMin / splitPrecision) - 1;
2485 xMaxI = (int)floor(xMax / splitPrecision) + 1;
2486 yMaxI = (int)floor(yMax / splitPrecision) + 1;
2487 horizProfile = (int *)gmallocn(yMaxI - yMinI + 1, sizeof(int));
2488 vertProfile = (int *)gmallocn(xMaxI - xMinI + 1, sizeof(int));
2489 memset(horizProfile, 0, (yMaxI - yMinI + 1) * sizeof(int));
2490 memset(vertProfile, 0, (xMaxI - xMinI + 1) * sizeof(int));
2491 for (i = 0; i < charsA->getLength(); ++i) {
2492 ch = (TextChar *)charsA->get(i);
2493 // yMinI2 and yMaxI2 are adjusted to allow for slightly overlapping lines
2494 switch (rot) {
2495 case 0:
2496 default:
2497 xMinI2 = (int)floor(ch->xMin / splitPrecision);
2498 xMaxI2 = (int)floor(ch->xMax / splitPrecision);
2499 ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
2500 yMinI2 = (int)floor((ch->yMin + ascentAdjust) / splitPrecision);
2501 descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
2502 yMaxI2 = (int)floor((ch->yMax - descentAdjust) / splitPrecision);
2503 break;
2504 case 1:
2505 descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
2506 xMinI2 = (int)floor((ch->xMin + descentAdjust) / splitPrecision);
2507 ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
2508 xMaxI2 = (int)floor((ch->xMax - ascentAdjust) / splitPrecision);
2509 yMinI2 = (int)floor(ch->yMin / splitPrecision);
2510 yMaxI2 = (int)floor(ch->yMax / splitPrecision);
2511 break;
2512 case 2:
2513 xMinI2 = (int)floor(ch->xMin / splitPrecision);
2514 xMaxI2 = (int)floor(ch->xMax / splitPrecision);
2515 descentAdjust = descentAdjustFactor * (ch->yMax - ch->yMin);
2516 yMinI2 = (int)floor((ch->yMin + descentAdjust) / splitPrecision);
2517 ascentAdjust = ascentAdjustFactor * (ch->yMax - ch->yMin);
2518 yMaxI2 = (int)floor((ch->yMax - ascentAdjust) / splitPrecision);
2519 break;
2520 case 3:
2521 ascentAdjust = ascentAdjustFactor * (ch->xMax - ch->xMin);
2522 xMinI2 = (int)floor((ch->xMin + ascentAdjust) / splitPrecision);
2523 descentAdjust = descentAdjustFactor * (ch->xMax - ch->xMin);
2524 xMaxI2 = (int)floor((ch->xMax - descentAdjust) / splitPrecision);
2525 yMinI2 = (int)floor(ch->yMin / splitPrecision);
2526 yMaxI2 = (int)floor(ch->yMax / splitPrecision);
2527 break;
2528 }
2529 for (y = yMinI2; y <= yMaxI2; ++y) {
2530 ++horizProfile[y - yMinI];
2531 }
2532 for (x = xMinI2; x <= xMaxI2; ++x) {
2533 ++vertProfile[x - xMinI];
2534 }
2535 }
2536
2537 //----- find the largest gaps in the horizontal and vertical profiles
2538
2539 horizGapSize = 0;
2540 for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
2541 for (y = start; y < yMaxI; ++y) {
2542 if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2543 start = y;
2544 } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2545 if (y - start > horizGapSize) {
2546 horizGapSize = y - start;
2547 }
2548 }
2549 }
2550 vertGapSize = 0;
2551 for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2552 for (x = start; x < xMaxI; ++x) {
2553 if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2554 start = x;
2555 } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2556 if (x - start > vertGapSize) {
2557 vertGapSize = x - start;
2558 }
2559 }
2560 }
2561 horizGapSize2 = horizGapSize - splitGapSlack * avgFontSize / splitPrecision;
2562 if (horizGapSize2 < 0.99) {
2563 horizGapSize2 = 0.99;
2564 }
2565 vertGapSize2 = vertGapSize - splitGapSlack * avgFontSize / splitPrecision;
2566 if (vertGapSize2 < 0.99) {
2567 vertGapSize2 = 0.99;
2568 }
2569
2570 //----- count horiz/vert gaps equivalent to largest gaps
2571
2572 minHorizChunkWidth = yMaxI - yMinI;
2573 nHorizGaps = 0;
2574 for (start = yMinI; start < yMaxI && !horizProfile[start - yMinI]; ++start) ;
2575 prev = start - 1;
2576 for (y = start; y < yMaxI; ++y) {
2577 if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2578 start = y;
2579 } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2580 if (y - start > horizGapSize2) {
2581 ++nHorizGaps;
2582 if (start - prev < minHorizChunkWidth) {
2583 minHorizChunkWidth = start - prev;
2584 }
2585 prev = y;
2586 }
2587 }
2588 }
2589 minVertChunkWidth = xMaxI - xMinI;
2590 nVertGaps = 0;
2591 for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2592 prev = start - 1;
2593 for (x = start; x < xMaxI; ++x) {
2594 if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2595 start = x;
2596 } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2597 if (x - start > vertGapSize2) {
2598 ++nVertGaps;
2599 if (start - prev < minVertChunkWidth) {
2600 minVertChunkWidth = start - prev;
2601 }
2602 prev = x;
2603 }
2604 }
2605 }
2606
2607 //----- compute splitting parameters
2608
2609 // approximation of number of lines in block
2610 if (fabs(avgFontSize) < 0.001) {
2611 nLines = 1;
2612 } else if (rot & 1) {
2613 nLines = (xMax - xMin) / avgFontSize;
2614 } else {
2615 nLines = (yMax - yMin) / avgFontSize;
2616 }
2617
2618 // compute the minimum allowed vertical gap size
2619 // (this is a horizontal gap threshold for rot=1,3
2620 if (control.mode == textOutTableLayout) {
2621 vertGapThreshold = vertGapThresholdTableMax
2622 + vertGapThresholdTableSlope * nLines;
2623 if (vertGapThreshold < vertGapThresholdTableMin) {
2624 vertGapThreshold = vertGapThresholdTableMin;
2625 }
2626 } else {
2627 vertGapThreshold = vertGapThresholdMax + vertGapThresholdSlope * nLines;
2628 if (vertGapThreshold < vertGapThresholdMin) {
2629 vertGapThreshold = vertGapThresholdMin;
2630 }
2631 }
2632 vertGapThreshold = vertGapThreshold * avgFontSize / splitPrecision;
2633
2634 // compute the minimum allowed chunk width
2635 if (control.mode == textOutTableLayout) {
2636 minChunk = 0;
2637 } else {
2638 minChunk = vertSplitChunkThreshold * avgFontSize / splitPrecision;
2639 }
2640
2641 // look for large chars
2642 // -- this kludge (multiply by 256, convert to int, divide by 256.0)
2643 // prevents floating point stability issues on x86 with gcc, where
2644 // largeCharSize could otherwise have slightly different values
2645 // here and where it's used below to do the large char partition
2646 // (because it gets truncated from 80 to 64 bits when spilled)
2647 largeCharSize = (int)(largeCharThreshold * avgFontSize * 256) / 256.0;
2648 nLargeChars = 0;
2649 for (i = 0; i < charsA->getLength(); ++i) {
2650 ch = (TextChar *)charsA->get(i);
2651 if (ch->fontSize > largeCharSize) {
2652 ++nLargeChars;
2653 }
2654 }
2655
2656 // figure out which type of split to do
2657 doHorizSplit = doVertSplit = gFalse;
2658 smallSplit = gFalse;
2659 if (rot & 1) {
2660 if (nHorizGaps > 0 &&
2661 (horizGapSize > vertGapSize || control.mode == textOutTableLayout) &&
2662 horizGapSize > vertGapThreshold &&
2663 minHorizChunkWidth > minChunk) {
2664 doHorizSplit = gTrue;
2665 } else if (nVertGaps > 0) {
2666 doVertSplit = gTrue;
2667 } else if (nLargeChars == 0 && nHorizGaps > 0) {
2668 doHorizSplit = gTrue;
2669 smallSplit = gTrue;
2670 }
2671 } else {
2672 if (nVertGaps > 0 &&
2673 (vertGapSize > horizGapSize || control.mode == textOutTableLayout) &&
2674 vertGapSize > vertGapThreshold &&
2675 minVertChunkWidth > minChunk) {
2676 doVertSplit = gTrue;
2677 } else if (nHorizGaps > 0) {
2678 doHorizSplit = gTrue;
2679 } else if (nLargeChars == 0 && nVertGaps > 0) {
2680 doVertSplit = gTrue;
2681 smallSplit = gTrue;
2682 }
2683 }
2684
2685 //----- split the block
2686
2687 //~ this could use "other content" (vector graphics, rotated text) --
2688 //~ presence of other content in a gap means we should definitely split
2689
2690 // split vertically
2691 if (doVertSplit) {
2692 blk = new TextBlock(blkVertSplit, rot);
2693 blk->smallSplit = smallSplit;
2694 for (start = xMinI; start < xMaxI && !vertProfile[start - xMinI]; ++start) ;
2695 prev = start - 1;
2696 for (x = start; x < xMaxI; ++x) {
2697 if (vertProfile[x - xMinI] && !vertProfile[x + 1 - xMinI]) {
2698 start = x;
2699 } else if (!vertProfile[x - xMinI] && vertProfile[x + 1 - xMinI]) {
2700 if (x - start > vertGapSize2) {
2701 chars2 = getChars(charsA, (prev + 0.5) * splitPrecision, yMin - 1,
2702 (start + 1.5) * splitPrecision, yMax + 1);
2703 blk->addChild(split(chars2, rot));
2704 delete chars2;
2705 prev = x;
2706 }
2707 }
2708 }
2709 chars2 = getChars(charsA, (prev + 0.5) * splitPrecision, yMin - 1,
2710 xMax + 1, yMax + 1);
2711 blk->addChild(split(chars2, rot));
2712 delete chars2;
2713
2714 // split horizontally
2715 } else if (doHorizSplit) {
2716 blk = new TextBlock(blkHorizSplit, rot);
2717 blk->smallSplit = smallSplit;
2718 for (start = yMinI;
2719 start < yMaxI && !horizProfile[start - yMinI];
2720 ++start) ;
2721 prev = start - 1;
2722 for (y = start; y < yMaxI; ++y) {
2723 if (horizProfile[y - yMinI] && !horizProfile[y + 1 - yMinI]) {
2724 start = y;
2725 } else if (!horizProfile[y - yMinI] && horizProfile[y + 1 - yMinI]) {
2726 if (y - start > horizGapSize2) {
2727 chars2 = getChars(charsA, xMin - 1, (prev + 0.5) * splitPrecision,
2728 xMax + 1, (start + 1.5) * splitPrecision);
2729 blk->addChild(split(chars2, rot));
2730 delete chars2;
2731 prev = y;
2732 }
2733 }
2734 }
2735 chars2 = getChars(charsA, xMin - 1, (prev + 0.5) * splitPrecision,
2736 xMax + 1, yMax + 1);
2737 blk->addChild(split(chars2, rot));
2738 delete chars2;
2739
2740 // split into larger and smaller chars
2741 } else if (nLargeChars > 0) {
2742 chars2 = new GList();
2743 chars3 = new GList();
2744 for (i = 0; i < charsA->getLength(); ++i) {
2745 ch = (TextChar *)charsA->get(i);
2746 if (ch->fontSize > largeCharSize) {
2747 chars2->append(ch);
2748 } else {
2749 chars3->append(ch);
2750 }
2751 }
2752 blk = split(chars3, rot);
2753 insertLargeChars(chars2, blk);
2754 delete chars2;
2755 delete chars3;
2756
2757 // create a leaf node
2758 } else {
2759 blk = new TextBlock(blkLeaf, rot);
2760 for (i = 0; i < charsA->getLength(); ++i) {
2761 blk->addChild((TextChar *)charsA->get(i));
2762 }
2763 }
2764
2765 gfree(horizProfile);
2766 gfree(vertProfile);
2767
2768 tagBlock(blk);
2769
2770 return blk;
2771 }
2772
2773 // Return the subset of chars inside a rectangle.
getChars(GList * charsA,double xMin,double yMin,double xMax,double yMax)2774 GList *TextPage::getChars(GList *charsA, double xMin, double yMin,
2775 double xMax, double yMax) {
2776 GList *ret;
2777 TextChar *ch;
2778 double x, y;
2779 int i;
2780
2781 ret = new GList();
2782 for (i = 0; i < charsA->getLength(); ++i) {
2783 ch = (TextChar *)charsA->get(i);
2784 // because of {ascent,descent}AdjustFactor, the y coords (or x
2785 // coords for rot 1,3) for the gaps will be a little bit tight --
2786 // so we use the center of the character here
2787 x = 0.5 * (ch->xMin + ch->xMax);
2788 y = 0.5 * (ch->yMin + ch->yMax);
2789 if (x > xMin && x < xMax && y > yMin && y < yMax) {
2790 ret->append(ch);
2791 }
2792 }
2793 return ret;
2794 }
2795
2796 // Decide whether this block is a line, column, or multiple columns:
2797 // - all leaf nodes are lines
2798 // - horiz split nodes whose children are lines or columns are columns
2799 // - other horiz split nodes are multiple columns
2800 // - vert split nodes, with small gaps, whose children are lines are lines
2801 // - other vert split nodes are multiple columns
2802 // (for rot=1,3: the horiz and vert splits are swapped)
2803 // In table layout mode:
2804 // - all leaf nodes are lines
2805 // - vert split nodes, with small gaps, whose children are lines are lines
2806 // - everything else is multiple columns
tagBlock(TextBlock * blk)2807 void TextPage::tagBlock(TextBlock *blk) {
2808 TextBlock *child;
2809 int i;
2810
2811 if (control.mode == textOutTableLayout) {
2812 if (blk->type == blkLeaf) {
2813 blk->tag = blkTagLine;
2814 } else if (blk->type == ((blk->rot & 1) ? blkHorizSplit : blkVertSplit) &&
2815 blk->smallSplit) {
2816 blk->tag = blkTagLine;
2817 for (i = 0; i < blk->children->getLength(); ++i) {
2818 child = (TextBlock *)blk->children->get(i);
2819 if (child->tag != blkTagLine) {
2820 blk->tag = blkTagMulticolumn;
2821 break;
2822 }
2823 }
2824 } else {
2825 blk->tag = blkTagMulticolumn;
2826 }
2827 return;
2828 }
2829
2830 if (blk->type == blkLeaf) {
2831 blk->tag = blkTagLine;
2832
2833 } else {
2834 if (blk->type == ((blk->rot & 1) ? blkVertSplit : blkHorizSplit)) {
2835 blk->tag = blkTagColumn;
2836 for (i = 0; i < blk->children->getLength(); ++i) {
2837 child = (TextBlock *)blk->children->get(i);
2838 if (child->tag != blkTagColumn && child->tag != blkTagLine) {
2839 blk->tag = blkTagMulticolumn;
2840 break;
2841 }
2842 }
2843 } else {
2844 if (blk->smallSplit) {
2845 blk->tag = blkTagLine;
2846 for (i = 0; i < blk->children->getLength(); ++i) {
2847 child = (TextBlock *)blk->children->get(i);
2848 if (child->tag != blkTagLine) {
2849 blk->tag = blkTagMulticolumn;
2850 break;
2851 }
2852 }
2853 } else {
2854 blk->tag = blkTagMulticolumn;
2855 }
2856 }
2857 }
2858 }
2859
2860 // Insert a list of large characters into a tree.
insertLargeChars(GList * largeChars,TextBlock * blk)2861 void TextPage::insertLargeChars(GList *largeChars, TextBlock *blk) {
2862 TextChar *ch, *ch2;
2863 GBool singleLine;
2864 double xLimit, yLimit, minOverlap;
2865 int i;
2866
2867 //~ this currently works only for characters in the primary rotation
2868
2869 // check to see if the large chars are a single line, in the
2870 // upper-left corner of blk (this is just a rough estimate)
2871 xLimit = blk->xMin + 0.5 * (blk->xMin + blk->xMax);
2872 yLimit = blk->yMin + 0.5 * (blk->yMin + blk->yMax);
2873 singleLine = gTrue;
2874 // note: largeChars are already sorted by x
2875 for (i = 0; i < largeChars->getLength(); ++i) {
2876 ch2 = (TextChar *)largeChars->get(i);
2877 if (ch2->xMax > xLimit || ch2->yMax > yLimit) {
2878 singleLine = gFalse;
2879 break;
2880 }
2881 if (i > 0) {
2882 ch = (TextChar *)largeChars->get(i-1);
2883 minOverlap = 0.5 * (ch->fontSize < ch2->fontSize ? ch->fontSize
2884 : ch2->fontSize);
2885 if (ch->yMax - ch2->yMin < minOverlap ||
2886 ch2->yMax - ch->yMin < minOverlap) {
2887 singleLine = gFalse;
2888 break;
2889 }
2890 }
2891 }
2892
2893 if (singleLine) {
2894 // if the large chars are a single line, prepend them to the first
2895 // leaf node in blk
2896 insertLargeCharsInFirstLeaf(largeChars, blk);
2897 } else {
2898 // if the large chars are not a single line, prepend each one to
2899 // the appropriate leaf node -- this handles cases like bullets
2900 // drawn in a large font, on the left edge of a column
2901 for (i = largeChars->getLength() - 1; i >= 0; --i) {
2902 ch = (TextChar *)largeChars->get(i);
2903 insertLargeCharInLeaf(ch, blk);
2904 }
2905 }
2906 }
2907
2908 // Find the first leaf (in depth-first order) in blk, and prepend a
2909 // list of large chars.
insertLargeCharsInFirstLeaf(GList * largeChars,TextBlock * blk)2910 void TextPage::insertLargeCharsInFirstLeaf(GList *largeChars, TextBlock *blk) {
2911 TextChar *ch;
2912 int i;
2913
2914 if (blk->type == blkLeaf) {
2915 for (i = largeChars->getLength() - 1; i >= 0; --i) {
2916 ch = (TextChar *)largeChars->get(i);
2917 blk->prependChild(ch);
2918 }
2919 } else {
2920 insertLargeCharsInFirstLeaf(largeChars, (TextBlock *)blk->children->get(0));
2921 blk->updateBounds(0);
2922 }
2923 }
2924
2925 // Find the leaf in <blk> where large char <ch> belongs, and prepend
2926 // it.
insertLargeCharInLeaf(TextChar * ch,TextBlock * blk)2927 void TextPage::insertLargeCharInLeaf(TextChar *ch, TextBlock *blk) {
2928 TextBlock *child;
2929 double y;
2930 int i;
2931
2932 //~ this currently works only for characters in the primary rotation
2933
2934 //~ this currently just looks down the left edge of blk
2935 //~ -- it could be extended to do more
2936
2937 // estimate the baseline of ch
2938 y = ch->yMin + 0.75 * (ch->yMax - ch->yMin);
2939
2940 if (blk->type == blkLeaf) {
2941 blk->prependChild(ch);
2942 } else if (blk->type == blkHorizSplit) {
2943 for (i = 0; i < blk->children->getLength(); ++i) {
2944 child = (TextBlock *)blk->children->get(i);
2945 if (y < child->yMax || i == blk->children->getLength() - 1) {
2946 insertLargeCharInLeaf(ch, child);
2947 blk->updateBounds(i);
2948 break;
2949 }
2950 }
2951 } else {
2952 insertLargeCharInLeaf(ch, (TextBlock *)blk->children->get(0));
2953 blk->updateBounds(0);
2954 }
2955 }
2956
2957 // Merge blk (rot != 0) into primaryTree (rot == 0).
insertIntoTree(TextBlock * blk,TextBlock * primaryTree)2958 void TextPage::insertIntoTree(TextBlock *blk, TextBlock *primaryTree) {
2959 TextBlock *child;
2960
2961 // we insert a whole column at a time - so call insertIntoTree
2962 // recursively until we get to a column (or line)
2963
2964 if (blk->tag == blkTagMulticolumn) {
2965 while (blk->children->getLength()) {
2966 child = (TextBlock *)blk->children->del(0);
2967 insertIntoTree(child, primaryTree);
2968 }
2969 delete blk;
2970 } else {
2971 insertColumnIntoTree(blk, primaryTree);
2972 }
2973 }
2974
2975 // Insert a column (as an atomic subtree) into tree.
2976 // Requirement: tree is not a leaf node.
insertColumnIntoTree(TextBlock * column,TextBlock * tree)2977 void TextPage::insertColumnIntoTree(TextBlock *column, TextBlock *tree) {
2978 TextBlock *child;
2979 int i;
2980
2981 for (i = 0; i < tree->children->getLength(); ++i) {
2982 child = (TextBlock *)tree->children->get(i);
2983 if (child->tag == blkTagMulticolumn &&
2984 column->xMin >= child->xMin &&
2985 column->yMin >= child->yMin &&
2986 column->xMax <= child->xMax &&
2987 column->yMax <= child->yMax) {
2988 insertColumnIntoTree(column, child);
2989 tree->tag = blkTagMulticolumn;
2990 return;
2991 }
2992 }
2993
2994 if (tree->type == blkVertSplit) {
2995 if (tree->rot == 1 || tree->rot == 2) {
2996 for (i = 0; i < tree->children->getLength(); ++i) {
2997 child = (TextBlock *)tree->children->get(i);
2998 if (column->xMax > 0.5 * (child->xMin + child->xMax)) {
2999 break;
3000 }
3001 }
3002 } else {
3003 for (i = 0; i < tree->children->getLength(); ++i) {
3004 child = (TextBlock *)tree->children->get(i);
3005 if (column->xMin < 0.5 * (child->xMin + child->xMax)) {
3006 break;
3007 }
3008 }
3009 }
3010 } else if (tree->type == blkHorizSplit) {
3011 if (tree->rot >= 2) {
3012 for (i = 0; i < tree->children->getLength(); ++i) {
3013 child = (TextBlock *)tree->children->get(i);
3014 if (column->yMax > 0.5 * (child->yMin + child->yMax)) {
3015 break;
3016 }
3017 }
3018 } else {
3019 for (i = 0; i < tree->children->getLength(); ++i) {
3020 child = (TextBlock *)tree->children->get(i);
3021 if (column->yMin < 0.5 * (child->yMin + child->yMax)) {
3022 break;
3023 }
3024 }
3025 }
3026 } else {
3027 // this should never happen
3028 return;
3029 }
3030 tree->children->insert(i, column);
3031 tree->tag = blkTagMulticolumn;
3032 }
3033
3034 // Insert clipped characters back into the TextBlock tree.
insertClippedChars(GList * clippedChars,TextBlock * tree)3035 void TextPage::insertClippedChars(GList *clippedChars, TextBlock *tree) {
3036 TextChar *ch, *ch2;
3037 TextBlock *leaf;
3038 double y;
3039 int i;
3040
3041 //~ this currently works only for characters in the primary rotation
3042
3043 clippedChars->sort(TextChar::cmpX);
3044 while (clippedChars->getLength()) {
3045 ch = (TextChar *)clippedChars->del(0);
3046 if (ch->rot != 0) {
3047 continue;
3048 }
3049 if (!(leaf = findClippedCharLeaf(ch, tree))) {
3050 continue;
3051 }
3052 leaf->addChild(ch);
3053 i = 0;
3054 while (i < clippedChars->getLength()) {
3055 ch2 = (TextChar *)clippedChars->get(i);
3056 if (ch2->xMin > ch->xMax + clippedTextMaxWordSpace * ch->fontSize) {
3057 break;
3058 }
3059 y = 0.5 * (ch2->yMin + ch2->yMax);
3060 if (y > leaf->yMin && y < leaf->yMax) {
3061 ch2 = (TextChar *)clippedChars->del(i);
3062 leaf->addChild(ch2);
3063 ch = ch2;
3064 } else {
3065 ++i;
3066 }
3067 }
3068 }
3069 }
3070
3071 // Find the leaf in <tree> to which clipped char <ch> can be appended.
3072 // Returns NULL if there is no appropriate append point.
findClippedCharLeaf(TextChar * ch,TextBlock * tree)3073 TextBlock *TextPage::findClippedCharLeaf(TextChar *ch, TextBlock *tree) {
3074 TextBlock *ret, *child;
3075 double y;
3076 int i;
3077
3078 //~ this currently works only for characters in the primary rotation
3079
3080 y = 0.5 * (ch->yMin + ch->yMax);
3081 if (tree->type == blkLeaf) {
3082 if (tree->rot == 0) {
3083 if (y > tree->yMin && y < tree->yMax &&
3084 ch->xMin <= tree->xMax + clippedTextMaxWordSpace * ch->fontSize) {
3085 return tree;
3086 }
3087 }
3088 } else {
3089 for (i = 0; i < tree->children->getLength(); ++i) {
3090 child = (TextBlock *)tree->children->get(i);
3091 if ((ret = findClippedCharLeaf(ch, child))) {
3092 return ret;
3093 }
3094 }
3095 }
3096 return NULL;
3097 }
3098
3099 // Convert the tree of TextBlocks into a list of TextColumns.
buildColumns(TextBlock * tree)3100 GList *TextPage::buildColumns(TextBlock *tree) {
3101 GList *columns;
3102
3103 columns = new GList();
3104 buildColumns2(tree, columns);
3105 return columns;
3106 }
3107
buildColumns2(TextBlock * blk,GList * columns)3108 void TextPage::buildColumns2(TextBlock *blk, GList *columns) {
3109 TextColumn *col;
3110 int i;
3111
3112 switch (blk->tag) {
3113 case blkTagLine:
3114 case blkTagColumn:
3115 col = buildColumn(blk);
3116 columns->append(col);
3117 break;
3118 case blkTagMulticolumn:
3119 for (i = 0; i < blk->children->getLength(); ++i) {
3120 buildColumns2((TextBlock *)blk->children->get(i), columns);
3121 }
3122 break;
3123 }
3124 }
3125
// Build a TextColumn from <blk>: collect the block's lines with
// buildLines, group consecutive lines into TextParagraphs, and return
// a new TextColumn carrying the paragraphs and blk's bounding box.
TextColumn *TextPage::buildColumn(TextBlock *blk) {
  GList *lines, *parLines;
  GList *paragraphs;
  TextLine *line0, *line1;
  double spaceThresh, indent0, indent1, fontSize0, fontSize1;
  int i;

  lines = new GList();
  buildLines(blk, lines);

  // line gaps larger than this end the current paragraph
  spaceThresh = paragraphSpacingThreshold * getAverageLineSpacing(lines);

  //~ could look for bulleted lists here: look for the case where
  //~ all out-dented lines start with the same char

  // build the paragraphs
  // -- each pass of this loop consumes one paragraph's worth of lines;
  //    <i> always indexes the first line not yet assigned
  paragraphs = new GList();
  i = 0;
  while (i < lines->getLength()) {

    // get the first line of the paragraph
    parLines = new GList();
    line0 = (TextLine *)lines->get(i);
    parLines->append(line0);
    ++i;

    // The first two lines decide which of three patterns to follow;
    // if none matches (or there is no second line), the paragraph
    // consists of line0 alone.
    if (i < lines->getLength()) {
      line1 = (TextLine *)lines->get(i);
      indent0 = getLineIndent(line0, blk);
      indent1 = getLineIndent(line1, blk);
      fontSize0 = line0->fontSize;
      fontSize1 = line1->fontSize;

      // inverted indent
      // -- the second line is indented further than the first; from
      //    here on indent0 holds the second line's indent, and the
      //    paragraph ends at a line that is out-dented from it, has a
      //    different font size, or follows too large a gap
      if (indent1 - indent0 > minParagraphIndent * fontSize0 &&
          fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
          getLineSpacing(line0, line1) <= spaceThresh) {
        parLines->append(line1);
        indent0 = indent1;
        for (++i; i < lines->getLength(); ++i) {
          line1 = (TextLine *)lines->get(i);
          indent1 = getLineIndent(line1, blk);
          fontSize1 = line1->fontSize;
          if (indent0 - indent1 > minParagraphIndent * fontSize0) {
            break;
          }
          if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
            break;
          }
          if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
              > spaceThresh) {
            break;
          }
          parLines->append(line1);
        }

      // drop cap
      // -- the first line is in a much larger font, the second line is
      //    indented relative to it, and the spacing between the two is
      //    negative
      } else if (fontSize0 > largeCharThreshold * fontSize1 &&
                 indent1 - indent0 > minParagraphIndent * fontSize1 &&
                 getLineSpacing(line0, line1) < 0) {
        parLines->append(line1);
        // from here on, compare against the second line's font size
        fontSize0 = fontSize1;
        // first run: lines that are still indented relative to line0
        for (++i; i < lines->getLength(); ++i) {
          line1 = (TextLine *)lines->get(i);
          indent1 = getLineIndent(line1, blk);
          if (indent1 - indent0 <= minParagraphIndent * fontSize0) {
            break;
          }
          if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
              > spaceThresh) {
            break;
          }
          parLines->append(line1);
        }
        // second run: lines that are no longer indented relative to
        // line0, with matching font size and spacing
        for (; i < lines->getLength(); ++i) {
          line1 = (TextLine *)lines->get(i);
          indent1 = getLineIndent(line1, blk);
          fontSize1 = line1->fontSize;
          if (indent1 - indent0 > minParagraphIndent * fontSize0) {
            break;
          }
          if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
            break;
          }
          if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
              > spaceThresh) {
            break;
          }
          parLines->append(line1);
        }

      // regular indent or no indent
      // -- indent0 holds the second line's indent; the paragraph ends
      //    at a line indented beyond it, with a different font size,
      //    or following too large a gap
      } else if (fabs(fontSize0 - fontSize1) <= paragraphFontSizeDelta &&
                 getLineSpacing(line0, line1) <= spaceThresh) {
        parLines->append(line1);
        indent0 = indent1;
        for (++i; i < lines->getLength(); ++i) {
          line1 = (TextLine *)lines->get(i);
          indent1 = getLineIndent(line1, blk);
          fontSize1 = line1->fontSize;
          if (indent1 - indent0 > minParagraphIndent * fontSize0) {
            break;
          }
          if (fabs(fontSize0 - fontSize1) > paragraphFontSizeDelta) {
            break;
          }
          if (getLineSpacing((TextLine *)lines->get(i - 1), line1)
              > spaceThresh) {
            break;
          }
          parLines->append(line1);
        }
      }
    }

    // the paragraph is constructed from the parLines list
    paragraphs->append(new TextParagraph(parLines));
  }

  // every line has been handed to some paragraph's list by now
  delete lines;

  return new TextColumn(paragraphs, blk->xMin, blk->yMin,
                        blk->xMax, blk->yMax);
}
3249
getLineIndent(TextLine * line,TextBlock * blk)3250 double TextPage::getLineIndent(TextLine *line, TextBlock *blk) {
3251 double indent;
3252
3253 switch (line->rot) {
3254 case 0:
3255 default: indent = line->xMin - blk->xMin; break;
3256 case 1: indent = line->yMin - blk->yMin; break;
3257 case 2: indent = blk->xMax - line->xMax; break;
3258 case 3: indent = blk->yMax - line->yMax; break;
3259 }
3260 return indent;
3261 }
3262
3263 // Compute average line spacing in column.
getAverageLineSpacing(GList * lines)3264 double TextPage::getAverageLineSpacing(GList *lines) {
3265 double avg, sp;
3266 int n, i;
3267
3268 avg = 0;
3269 n = 0;
3270 for (i = 1; i < lines->getLength(); ++i) {
3271 sp = getLineSpacing((TextLine *)lines->get(i - 1),
3272 (TextLine *)lines->get(i));
3273 if (sp > 0) {
3274 avg += sp;
3275 ++n;
3276 }
3277 }
3278 if (n > 0) {
3279 avg /= n;
3280 }
3281 return avg;
3282 }
3283
3284 // Compute the space between two lines.
getLineSpacing(TextLine * line0,TextLine * line1)3285 double TextPage::getLineSpacing(TextLine *line0, TextLine *line1) {
3286 double sp;
3287
3288 switch (line0->rot) {
3289 case 0:
3290 default: sp = line1->yMin - line0->yMax; break;
3291 case 1: sp = line0->xMin - line1->xMax; break;
3292 case 2: sp = line0->yMin - line1->yMin; break;
3293 case 3: sp = line1->xMin - line1->xMax; break;
3294 }
3295 return sp;
3296 }
3297
buildLines(TextBlock * blk,GList * lines)3298 void TextPage::buildLines(TextBlock *blk, GList *lines) {
3299 TextLine *line;
3300 int i;
3301
3302 switch (blk->tag) {
3303 case blkTagLine:
3304 line = buildLine(blk);
3305 if (blk->rot == 1 || blk->rot == 2) {
3306 lines->insert(0, line);
3307 } else {
3308 lines->append(line);
3309 }
3310 break;
3311 case blkTagColumn:
3312 case blkTagMulticolumn: // multicolumn should never happen here
3313 for (i = 0; i < blk->children->getLength(); ++i) {
3314 buildLines((TextBlock *)blk->children->get(i), lines);
3315 }
3316 break;
3317 }
3318 }
3319
buildLine(TextBlock * blk)3320 TextLine *TextPage::buildLine(TextBlock *blk) {
3321 GList *charsA;
3322 GList *words;
3323 TextChar *ch, *ch2;
3324 TextWord *word;
3325 double wordSp, lineFontSize, sp;
3326 GBool spaceAfter, spaceAfter2;
3327 int i, j;
3328
3329 charsA = new GList();
3330 getLineChars(blk, charsA);
3331
3332 wordSp = computeWordSpacingThreshold(charsA, blk->rot);
3333
3334 words = new GList();
3335 lineFontSize = 0;
3336 spaceAfter = gFalse;
3337 i = 0;
3338 while (i < charsA->getLength()) {
3339 sp = wordSp - 1;
3340 for (j = i+1; j < charsA->getLength(); ++j) {
3341 ch = (TextChar *)charsA->get(j-1);
3342 ch2 = (TextChar *)charsA->get(j);
3343 sp = (blk->rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
3344 if (sp > wordSp ||
3345 ch->font != ch2->font ||
3346 fabs(ch->fontSize - ch2->fontSize) > 0.01 ||
3347 (control.mode == textOutRawOrder &&
3348 ch2->charPos != ch->charPos + ch->charLen)) {
3349 break;
3350 }
3351 sp = wordSp - 1;
3352 }
3353 spaceAfter2 = spaceAfter;
3354 spaceAfter = sp > wordSp;
3355 word = new TextWord(charsA, i, j - i, blk->rot,
3356 (blk->rot >= 2) ? spaceAfter2 : spaceAfter);
3357 i = j;
3358 if (blk->rot >= 2) {
3359 words->insert(0, word);
3360 } else {
3361 words->append(word);
3362 }
3363 if (i == 0 || word->fontSize > lineFontSize) {
3364 lineFontSize = word->fontSize;
3365 }
3366 }
3367
3368 delete charsA;
3369
3370 return new TextLine(words, blk->xMin, blk->yMin, blk->xMax, blk->yMax,
3371 lineFontSize);
3372 }
3373
getLineChars(TextBlock * blk,GList * charsA)3374 void TextPage::getLineChars(TextBlock *blk, GList *charsA) {
3375 int i;
3376
3377 if (blk->type == blkLeaf) {
3378 charsA->append(blk->children);
3379 } else {
3380 for (i = 0; i < blk->children->getLength(); ++i) {
3381 getLineChars((TextBlock *)blk->children->get(i), charsA);
3382 }
3383 }
3384 }
3385
3386 // Compute the inter-word spacing threshold for a line of chars.
3387 // Spaces greater than this threshold will be considered inter-word
3388 // spaces.
computeWordSpacingThreshold(GList * charsA,int rot)3389 double TextPage::computeWordSpacingThreshold(GList *charsA, int rot) {
3390 TextChar *ch, *ch2;
3391 double avgFontSize, minSp, maxSp, sp;
3392 int i;
3393
3394 avgFontSize = 0;
3395 minSp = maxSp = 0;
3396 for (i = 0; i < charsA->getLength(); ++i) {
3397 ch = (TextChar *)charsA->get(i);
3398 avgFontSize += ch->fontSize;
3399 if (i < charsA->getLength() - 1) {
3400 ch2 = (TextChar *)charsA->get(i+1);
3401 sp = (rot & 1) ? (ch2->yMin - ch->yMax) : (ch2->xMin - ch->xMax);
3402 if (i == 0 || sp < minSp) {
3403 minSp = sp;
3404 }
3405 if (sp > maxSp) {
3406 maxSp = sp;
3407 }
3408 }
3409 }
3410 avgFontSize /= charsA->getLength();
3411 if (minSp < 0) {
3412 minSp = 0;
3413 }
3414
3415 // if spacing is completely uniform, assume it's a single word
3416 // (technically it could be either "ABC" or "A B C", but it's
3417 // essentially impossible to tell)
3418 if (maxSp - minSp < uniformSpacing * avgFontSize) {
3419 return maxSp + 1;
3420
3421 // if there is some variation in spacing, but it's small, assume
3422 // there are some inter-word spaces
3423 } else if (maxSp - minSp < wordSpacing * avgFontSize) {
3424 return 0.5 * (minSp + maxSp);
3425
3426 // otherwise, assume a reasonable threshold for inter-word spacing
3427 // (we can't use something like 0.5*(minSp+maxSp) here because there
3428 // can be outliers at the high end)
3429 } else {
3430 return minSp + wordSpacing * avgFontSize;
3431 }
3432 }
3433
assignPhysLayoutPositions(GList * columns)3434 int TextPage::assignPhysLayoutPositions(GList *columns) {
3435 assignLinePhysPositions(columns);
3436 return assignColumnPhysPositions(columns);
3437 }
3438
3439 // Assign a physical x coordinate for each TextLine (relative to the
3440 // containing TextColumn). This also computes TextColumn width and
3441 // height.
assignLinePhysPositions(GList * columns)3442 void TextPage::assignLinePhysPositions(GList *columns) {
3443 TextColumn *col;
3444 TextParagraph *par;
3445 TextLine *line;
3446 UnicodeMap *uMap;
3447 int colIdx, parIdx, lineIdx;
3448
3449 if (!(uMap = globalParams->getTextEncoding())) {
3450 return;
3451 }
3452
3453 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3454 col = (TextColumn *)columns->get(colIdx);
3455 col->pw = col->ph = 0;
3456 for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
3457 par = (TextParagraph *)col->paragraphs->get(parIdx);
3458 for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
3459 line = (TextLine *)par->lines->get(lineIdx);
3460 computeLinePhysWidth(line, uMap);
3461 if (control.fixedPitch > 0) {
3462 line->px = (int)((line->xMin - col->xMin) / control.fixedPitch);
3463 } else if (fabs(line->fontSize) < 0.001) {
3464 line->px = 0;
3465 } else {
3466 line->px = (int)((line->xMin - col->xMin) /
3467 (physLayoutSpaceWidth * line->fontSize));
3468 }
3469 if (line->px + line->pw > col->pw) {
3470 col->pw = line->px + line->pw;
3471 }
3472 }
3473 col->ph += par->lines->getLength();
3474 }
3475 col->ph += col->paragraphs->getLength() - 1;
3476 }
3477
3478 uMap->decRefCnt();
3479 }
3480
computeLinePhysWidth(TextLine * line,UnicodeMap * uMap)3481 void TextPage::computeLinePhysWidth(TextLine *line, UnicodeMap *uMap) {
3482 char buf[8];
3483 int n, i;
3484
3485 if (uMap->isUnicode()) {
3486 line->pw = line->len;
3487 } else {
3488 line->pw = 0;
3489 for (i = 0; i < line->len; ++i) {
3490 n = uMap->mapUnicode(line->text[i], buf, sizeof(buf));
3491 line->pw += n;
3492 }
3493 }
3494 }
3495
3496 // Assign physical x and y coordinates for each TextColumn. Returns
3497 // the text height (max physical y + 1).
assignColumnPhysPositions(GList * columns)3498 int TextPage::assignColumnPhysPositions(GList *columns) {
3499 TextColumn *col, *col2;
3500 double slack, xOverlap, yOverlap;
3501 int ph, i, j;
3502
3503 if (control.mode == textOutTableLayout) {
3504 slack = tableCellOverlapSlack;
3505 } else {
3506 slack = 0;
3507 }
3508
3509 // assign x positions
3510 columns->sort(&TextColumn::cmpX);
3511 for (i = 0; i < columns->getLength(); ++i) {
3512 col = (TextColumn *)columns->get(i);
3513 if (control.fixedPitch) {
3514 col->px = (int)(col->xMin / control.fixedPitch);
3515 } else {
3516 col->px = 0;
3517 for (j = 0; j < i; ++j) {
3518 col2 = (TextColumn *)columns->get(j);
3519 xOverlap = col2->xMax - col->xMin;
3520 if (xOverlap < slack * (col2->xMax - col2->xMin)) {
3521 if (col2->px + col2->pw + 2 > col->px) {
3522 col->px = col2->px + col2->pw + 2;
3523 }
3524 } else {
3525 yOverlap = (col->yMax < col2->yMax ? col->yMax : col2->yMax) -
3526 (col->yMin > col2->yMin ? col->yMin : col2->yMin);
3527 if (yOverlap > 0 && xOverlap < yOverlap) {
3528 if (col2->px + col2->pw > col->px) {
3529 col->px = col2->px + col2->pw;
3530 }
3531 } else {
3532 if (col2->px > col->px) {
3533 col->px = col2->px;
3534 }
3535 }
3536 }
3537 }
3538 }
3539 }
3540
3541 // assign y positions
3542 ph = 0;
3543 columns->sort(&TextColumn::cmpY);
3544 for (i = 0; i < columns->getLength(); ++i) {
3545 col = (TextColumn *)columns->get(i);
3546 col->py = 0;
3547 for (j = 0; j < i; ++j) {
3548 col2 = (TextColumn *)columns->get(j);
3549 yOverlap = col2->yMax - col->yMin;
3550 if (yOverlap < slack * (col2->yMax - col2->yMin)) {
3551 if (col2->py + col2->ph + 1 > col->py) {
3552 col->py = col2->py + col2->ph + 1;
3553 }
3554 } else {
3555 xOverlap = (col->xMax < col2->xMax ? col->xMax : col2->xMax) -
3556 (col->xMin > col2->xMin ? col->xMin : col2->xMin);
3557 if (xOverlap > 0 && yOverlap < xOverlap) {
3558 if (col2->py + col2->ph > col->py) {
3559 col->py = col2->py + col2->ph;
3560 }
3561 } else {
3562 if (col2->py > col->py) {
3563 col->py = col2->py;
3564 }
3565 }
3566 }
3567 }
3568 if (col->py + col->ph > ph) {
3569 ph = col->py + col->ph;
3570 }
3571 }
3572
3573 return ph;
3574 }
3575
generateUnderlinesAndLinks(GList * columns)3576 void TextPage::generateUnderlinesAndLinks(GList *columns) {
3577 TextColumn *col;
3578 TextParagraph *par;
3579 TextLine *line;
3580 TextWord *word;
3581 TextUnderline *underline;
3582 TextLink *link;
3583 double base, uSlack, ubSlack, hSlack;
3584 int colIdx, parIdx, lineIdx, wordIdx, i;
3585
3586 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3587 col = (TextColumn *)columns->get(colIdx);
3588 for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
3589 par = (TextParagraph *)col->paragraphs->get(parIdx);
3590 for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
3591 line = (TextLine *)par->lines->get(lineIdx);
3592 for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
3593 word = (TextWord *)line->words->get(wordIdx);
3594 base = word->getBaseline();
3595 uSlack = underlineSlack * word->fontSize;
3596 ubSlack = underlineBaselineSlack * word->fontSize;
3597 hSlack = hyperlinkSlack * word->fontSize;
3598
3599 //----- handle underlining
3600 for (i = 0; i < underlines->getLength(); ++i) {
3601 underline = (TextUnderline *)underlines->get(i);
3602 if (underline->horiz) {
3603 if (word->rot == 0 || word->rot == 2) {
3604 if (fabs(underline->y0 - base) < ubSlack &&
3605 underline->x0 < word->xMin + uSlack &&
3606 word->xMax - uSlack < underline->x1) {
3607 word->underlined = gTrue;
3608 }
3609 }
3610 } else {
3611 if (word->rot == 1 || word->rot == 3) {
3612 if (fabs(underline->x0 - base) < ubSlack &&
3613 underline->y0 < word->yMin + uSlack &&
3614 word->yMax - uSlack < underline->y1) {
3615 word->underlined = gTrue;
3616 }
3617 }
3618 }
3619 }
3620
3621 //----- handle links
3622 for (i = 0; i < links->getLength(); ++i) {
3623 link = (TextLink *)links->get(i);
3624 if (link->xMin < word->xMin + hSlack &&
3625 word->xMax - hSlack < link->xMax &&
3626 link->yMin < word->yMin + hSlack &&
3627 word->yMax - hSlack < link->yMax) {
3628 word->link = link;
3629 }
3630 }
3631 }
3632 }
3633 }
3634 }
3635 }
3636
3637 //------------------------------------------------------------------------
3638 // TextPage: access
3639 //------------------------------------------------------------------------
3640
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,GBool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)3641 GBool TextPage::findText(Unicode *s, int len,
3642 GBool startAtTop, GBool stopAtBottom,
3643 GBool startAtLast, GBool stopAtLast,
3644 GBool caseSensitive, GBool backward,
3645 GBool wholeWord,
3646 double *xMin, double *yMin,
3647 double *xMax, double *yMax) {
3648 TextBlock *tree;
3649 TextColumn *column;
3650 TextParagraph *par;
3651 TextLine *line;
3652 Unicode *s2, *txt;
3653 Unicode *p;
3654 double xStart, yStart, xStop, yStop;
3655 double xMin0, yMin0, xMax0, yMax0;
3656 double xMin1, yMin1, xMax1, yMax1;
3657 GBool found;
3658 int txtSize, m, rot, colIdx, parIdx, lineIdx, i, j, k;
3659
3660 //~ need to handle right-to-left text
3661
3662 if (!findCols) {
3663 rot = rotateChars(chars);
3664 if ((tree = splitChars(chars))) {
3665 findCols = buildColumns(tree);
3666 delete tree;
3667 } else {
3668 // no text
3669 findCols = new GList();
3670 }
3671 unrotateChars(chars, rot);
3672 unrotateColumns(findCols, rot);
3673 }
3674
3675 // convert the search string to uppercase
3676 if (!caseSensitive) {
3677 s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
3678 for (i = 0; i < len; ++i) {
3679 s2[i] = unicodeToUpper(s[i]);
3680 }
3681 } else {
3682 s2 = s;
3683 }
3684
3685 txt = NULL;
3686 txtSize = 0;
3687
3688 xStart = yStart = xStop = yStop = 0;
3689 if (startAtLast && haveLastFind) {
3690 xStart = lastFindXMin;
3691 yStart = lastFindYMin;
3692 } else if (!startAtTop) {
3693 xStart = *xMin;
3694 yStart = *yMin;
3695 }
3696 if (stopAtLast && haveLastFind) {
3697 xStop = lastFindXMin;
3698 yStop = lastFindYMin;
3699 } else if (!stopAtBottom) {
3700 xStop = *xMax;
3701 yStop = *yMax;
3702 }
3703
3704 found = gFalse;
3705 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3706 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3707
3708 for (colIdx = backward ? findCols->getLength() - 1 : 0;
3709 backward ? colIdx >= 0 : colIdx < findCols->getLength();
3710 colIdx += backward ? -1 : 1) {
3711 column = (TextColumn *)findCols->get(colIdx);
3712
3713 // check: is the column above the top limit?
3714 if (!startAtTop && (backward ? column->yMin > yStart
3715 : column->yMax < yStart)) {
3716 continue;
3717 }
3718
3719 // check: is the column below the bottom limit?
3720 if (!stopAtBottom && (backward ? column->yMax < yStop
3721 : column->yMin > yStop)) {
3722 continue;
3723 }
3724
3725 for (parIdx = backward ? column->paragraphs->getLength() - 1 : 0;
3726 backward ? parIdx >= 0 : parIdx < column->paragraphs->getLength();
3727 parIdx += backward ? -1 : 1) {
3728 par = (TextParagraph *)column->paragraphs->get(parIdx);
3729
3730 // check: is the paragraph above the top limit?
3731 if (!startAtTop && (backward ? par->yMin > yStart
3732 : par->yMax < yStart)) {
3733 continue;
3734 }
3735
3736 // check: is the paragraph below the bottom limit?
3737 if (!stopAtBottom && (backward ? par->yMax < yStop
3738 : par->yMin > yStop)) {
3739 continue;
3740 }
3741
3742 for (lineIdx = backward ? par->lines->getLength() - 1 : 0;
3743 backward ? lineIdx >= 0 : lineIdx < par->lines->getLength();
3744 lineIdx += backward ? -1 : 1) {
3745 line = (TextLine *)par->lines->get(lineIdx);
3746
3747 // check: is the line above the top limit?
3748 if (!startAtTop && (backward ? line->yMin > yStart
3749 : line->yMax < yStart)) {
3750 continue;
3751 }
3752
3753 // check: is the line below the bottom limit?
3754 if (!stopAtBottom && (backward ? line->yMax < yStop
3755 : line->yMin > yStop)) {
3756 continue;
3757 }
3758
3759 // convert the line to uppercase
3760 m = line->len;
3761 if (!caseSensitive) {
3762 if (m > txtSize) {
3763 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
3764 txtSize = m;
3765 }
3766 for (k = 0; k < m; ++k) {
3767 txt[k] = unicodeToUpper(line->text[k]);
3768 }
3769 } else {
3770 txt = line->text;
3771 }
3772
3773 // search each position in this line
3774 j = backward ? m - len : 0;
3775 p = txt + j;
3776 while (backward ? j >= 0 : j <= m - len) {
3777 if (!wholeWord ||
3778 ((j == 0 || !unicodeTypeWord(txt[j - 1])) &&
3779 (j + len == m || !unicodeTypeWord(txt[j + len])))) {
3780
3781 // compare the strings
3782 for (k = 0; k < len; ++k) {
3783 if (p[k] != s2[k]) {
3784 break;
3785 }
3786 }
3787
3788 // found it
3789 if (k == len) {
3790 switch (line->rot) {
3791 case 0:
3792 xMin1 = line->edge[j];
3793 xMax1 = line->edge[j + len];
3794 yMin1 = line->yMin;
3795 yMax1 = line->yMax;
3796 break;
3797 case 1:
3798 xMin1 = line->xMin;
3799 xMax1 = line->xMax;
3800 yMin1 = line->edge[j];
3801 yMax1 = line->edge[j + len];
3802 break;
3803 case 2:
3804 xMin1 = line->edge[j + len];
3805 xMax1 = line->edge[j];
3806 yMin1 = line->yMin;
3807 yMax1 = line->yMax;
3808 break;
3809 case 3:
3810 xMin1 = line->xMin;
3811 xMax1 = line->xMax;
3812 yMin1 = line->edge[j + len];
3813 yMax1 = line->edge[j];
3814 break;
3815 }
3816 if (backward) {
3817 if ((startAtTop ||
3818 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
3819 (stopAtBottom ||
3820 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
3821 if (!found ||
3822 yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
3823 xMin0 = xMin1;
3824 xMax0 = xMax1;
3825 yMin0 = yMin1;
3826 yMax0 = yMax1;
3827 found = gTrue;
3828 }
3829 }
3830 } else {
3831 if ((startAtTop ||
3832 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
3833 (stopAtBottom ||
3834 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
3835 if (!found ||
3836 yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
3837 xMin0 = xMin1;
3838 xMax0 = xMax1;
3839 yMin0 = yMin1;
3840 yMax0 = yMax1;
3841 found = gTrue;
3842 }
3843 }
3844 }
3845 }
3846 }
3847 if (backward) {
3848 --j;
3849 --p;
3850 } else {
3851 ++j;
3852 ++p;
3853 }
3854 }
3855 }
3856 }
3857 }
3858
3859 if (!caseSensitive) {
3860 gfree(s2);
3861 gfree(txt);
3862 }
3863
3864 if (found) {
3865 *xMin = xMin0;
3866 *xMax = xMax0;
3867 *yMin = yMin0;
3868 *yMax = yMax0;
3869 lastFindXMin = xMin0;
3870 lastFindYMin = yMin0;
3871 haveLastFind = gTrue;
3872 return gTrue;
3873 }
3874
3875 return gFalse;
3876 }
3877
getText(double xMin,double yMin,double xMax,double yMax)3878 GString *TextPage::getText(double xMin, double yMin,
3879 double xMax, double yMax) {
3880 UnicodeMap *uMap;
3881 char space[8], eol[16];
3882 int spaceLen, eolLen;
3883 GList *chars2;
3884 GString **out;
3885 int *outLen;
3886 TextColumn *col;
3887 TextParagraph *par;
3888 TextLine *line;
3889 TextChar *ch;
3890 GBool primaryLR;
3891 TextBlock *tree;
3892 GList *columns;
3893 GString *ret;
3894 double xx, yy;
3895 int rot, colIdx, parIdx, lineIdx, ph, y, i;
3896
3897 // get the output encoding
3898 if (!(uMap = globalParams->getTextEncoding())) {
3899 return NULL;
3900 }
3901 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3902 eolLen = 0; // make gcc happy
3903 switch (globalParams->getTextEOL()) {
3904 case eolUnix:
3905 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3906 break;
3907 case eolDOS:
3908 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3909 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3910 break;
3911 case eolMac:
3912 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3913 break;
3914 }
3915
3916 // get all chars in the rectangle
3917 // (i.e., all chars whose center lies inside the rectangle)
3918 chars2 = new GList();
3919 for (i = 0; i < chars->getLength(); ++i) {
3920 ch = (TextChar *)chars->get(i);
3921 xx = 0.5 * (ch->xMin + ch->xMax);
3922 yy = 0.5 * (ch->yMin + ch->yMax);
3923 if (xx > xMin && xx < xMax && yy > yMin && yy < yMax) {
3924 chars2->append(ch);
3925 }
3926 }
3927 #if 0 //~debug
3928 dumpChars(chars2);
3929 #endif
3930
3931 rot = rotateChars(chars2);
3932 primaryLR = checkPrimaryLR(chars2);
3933 tree = splitChars(chars2);
3934 if (!tree) {
3935 unrotateChars(chars2, rot);
3936 delete chars2;
3937 return new GString();
3938 }
3939 #if 0 //~debug
3940 dumpTree(tree);
3941 #endif
3942 columns = buildColumns(tree);
3943 delete tree;
3944 ph = assignPhysLayoutPositions(columns);
3945 #if 0 //~debug
3946 dumpColumns(columns);
3947 #endif
3948 unrotateChars(chars2, rot);
3949 delete chars2;
3950
3951 out = (GString **)gmallocn(ph, sizeof(GString *));
3952 outLen = (int *)gmallocn(ph, sizeof(int));
3953 for (i = 0; i < ph; ++i) {
3954 out[i] = NULL;
3955 outLen[i] = 0;
3956 }
3957
3958 columns->sort(&TextColumn::cmpPX);
3959 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
3960 col = (TextColumn *)columns->get(colIdx);
3961 y = col->py;
3962 for (parIdx = 0;
3963 parIdx < col->paragraphs->getLength() && y < ph;
3964 ++parIdx) {
3965 par = (TextParagraph *)col->paragraphs->get(parIdx);
3966 for (lineIdx = 0;
3967 lineIdx < par->lines->getLength() && y < ph;
3968 ++lineIdx) {
3969 line = (TextLine *)par->lines->get(lineIdx);
3970 if (!out[y]) {
3971 out[y] = new GString();
3972 }
3973 while (outLen[y] < col->px + line->px) {
3974 out[y]->append(space, spaceLen);
3975 ++outLen[y];
3976 }
3977 encodeFragment(line->text, line->len, uMap, primaryLR, out[y]);
3978 outLen[y] += line->pw;
3979 ++y;
3980 }
3981 if (parIdx + 1 < col->paragraphs->getLength()) {
3982 ++y;
3983 }
3984 }
3985 }
3986
3987 ret = new GString();
3988 for (i = 0; i < ph; ++i) {
3989 if (out[i]) {
3990 ret->append(out[i]);
3991 delete out[i];
3992 }
3993 if (ph > 1) {
3994 ret->append(eol, eolLen);
3995 }
3996 }
3997
3998 gfree(out);
3999 gfree(outLen);
4000 deleteGList(columns, TextColumn);
4001 uMap->decRefCnt();
4002
4003 return ret;
4004 }
4005
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)4006 GBool TextPage::findCharRange(int pos, int length,
4007 double *xMin, double *yMin,
4008 double *xMax, double *yMax) {
4009 TextChar *ch;
4010 double xMin2, yMin2, xMax2, yMax2;
4011 GBool first;
4012 int i;
4013
4014 //~ this doesn't correctly handle ranges split across multiple lines
4015 //~ (the highlighted region is the bounding box of all the parts of
4016 //~ the range)
4017
4018 xMin2 = yMin2 = xMax2 = yMax2 = 0;
4019 first = gTrue;
4020 for (i = 0; i < chars->getLength(); ++i) {
4021 ch = (TextChar *)chars->get(i);
4022 if (ch->charPos >= pos && ch->charPos < pos + length) {
4023 if (first || ch->xMin < xMin2) {
4024 xMin2 = ch->xMin;
4025 }
4026 if (first || ch->yMin < yMin2) {
4027 yMin2 = ch->yMin;
4028 }
4029 if (first || ch->xMax > xMax2) {
4030 xMax2 = ch->xMax;
4031 }
4032 if (first || ch->yMax > yMax2) {
4033 yMax2 = ch->yMax;
4034 }
4035 first = gFalse;
4036 }
4037 }
4038 if (first) {
4039 return gFalse;
4040 }
4041 *xMin = xMin2;
4042 *yMin = yMin2;
4043 *xMax = xMax2;
4044 *yMax = yMax2;
4045 return gTrue;
4046 }
4047
makeWordList()4048 TextWordList *TextPage::makeWordList() {
4049 TextBlock *tree;
4050 GList *columns;
4051 TextColumn *col;
4052 TextParagraph *par;
4053 TextLine *line;
4054 TextWord *word;
4055 GList *words;
4056 int rot, colIdx, parIdx, lineIdx, wordIdx;
4057
4058 rot = rotateChars(chars);
4059 tree = splitChars(chars);
4060 if (!tree) {
4061 // no text
4062 unrotateChars(chars, rot);
4063 return new TextWordList(new GList());
4064 }
4065 columns = buildColumns(tree);
4066 delete tree;
4067 unrotateChars(chars, rot);
4068 if (control.html) {
4069 rotateUnderlinesAndLinks(rot);
4070 generateUnderlinesAndLinks(columns);
4071 }
4072
4073 words = new GList();
4074 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
4075 col = (TextColumn *)columns->get(colIdx);
4076 for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
4077 par = (TextParagraph *)col->paragraphs->get(parIdx);
4078 for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
4079 line = (TextLine *)par->lines->get(lineIdx);
4080 for (wordIdx = 0; wordIdx < line->words->getLength(); ++wordIdx) {
4081 word = (TextWord *)line->words->get(wordIdx);
4082 words->append(word->copy());
4083 }
4084 }
4085 }
4086 }
4087
4088 switch (control.mode) {
4089 case textOutReadingOrder:
4090 // already in reading order
4091 break;
4092 case textOutPhysLayout:
4093 case textOutTableLayout:
4094 case textOutLinePrinter:
4095 words->sort(&TextWord::cmpYX);
4096 break;
4097 case textOutRawOrder:
4098 words->sort(&TextWord::cmpCharPos);
4099 break;
4100 }
4101
4102 // this has to be done after sorting with cmpYX
4103 unrotateColumns(columns, rot);
4104 unrotateWords(words, rot);
4105
4106 deleteGList(columns, TextColumn);
4107
4108 return new TextWordList(words);
4109 }
4110
4111 //------------------------------------------------------------------------
4112 // TextPage: debug
4113 //------------------------------------------------------------------------
4114
4115 #if 0 //~debug
4116
4117 void TextPage::dumpChars(GList *charsA) {
4118 TextChar *ch;
4119 int i;
4120
4121 for (i = 0; i < charsA->getLength(); ++i) {
4122 ch = (TextChar *)charsA->get(i);
4123 printf("char: U+%04x '%c' xMin=%g yMin=%g xMax=%g yMax=%g fontSize=%g rot=%d\n",
4124 ch->c, ch->c & 0xff, ch->xMin, ch->yMin, ch->xMax, ch->yMax,
4125 ch->fontSize, ch->rot);
4126 }
4127 }
4128
4129 void TextPage::dumpTree(TextBlock *tree, int indent) {
4130 TextChar *ch;
4131 int i;
4132
4133 printf("%*sblock: type=%s tag=%s small=%d rot=%d xMin=%g yMin=%g xMax=%g yMax=%g\n",
4134 indent, "",
4135 tree->type == blkLeaf ? "leaf" :
4136 tree->type == blkHorizSplit ? "horiz" : "vert",
4137 tree->tag == blkTagMulticolumn ? "multicolumn" :
4138 tree->tag == blkTagColumn ? "column" : "line",
4139 tree->smallSplit,
4140 tree->rot, tree->xMin, tree->yMin, tree->xMax, tree->yMax);
4141 if (tree->type == blkLeaf) {
4142 for (i = 0; i < tree->children->getLength(); ++i) {
4143 ch = (TextChar *)tree->children->get(i);
4144 printf("%*schar: '%c' xMin=%g yMin=%g xMax=%g yMax=%g font=%d.%d\n",
4145 indent + 2, "", ch->c & 0xff,
4146 ch->xMin, ch->yMin, ch->xMax, ch->yMax,
4147 ch->font->fontID.num, ch->font->fontID.gen);
4148 }
4149 } else {
4150 for (i = 0; i < tree->children->getLength(); ++i) {
4151 dumpTree((TextBlock *)tree->children->get(i), indent + 2);
4152 }
4153 }
4154 }
4155
4156 void TextPage::dumpColumns(GList *columns) {
4157 TextColumn *col;
4158 TextParagraph *par;
4159 TextLine *line;
4160 int colIdx, parIdx, lineIdx, i;
4161
4162 for (colIdx = 0; colIdx < columns->getLength(); ++colIdx) {
4163 col = (TextColumn *)columns->get(colIdx);
4164 printf("column: xMin=%g yMin=%g xMax=%g yMax=%g px=%d py=%d pw=%d ph=%d\n",
4165 col->xMin, col->yMin, col->xMax, col->yMax,
4166 col->px, col->py, col->pw, col->ph);
4167 for (parIdx = 0; parIdx < col->paragraphs->getLength(); ++parIdx) {
4168 par = (TextParagraph *)col->paragraphs->get(parIdx);
4169 printf(" paragraph:\n");
4170 for (lineIdx = 0; lineIdx < par->lines->getLength(); ++lineIdx) {
4171 line = (TextLine *)par->lines->get(lineIdx);
4172 printf(" line: xMin=%g yMin=%g xMax=%g yMax=%g px=%d pw=%d rot=%d\n",
4173 line->xMin, line->yMin, line->xMax, line->yMax,
4174 line->px, line->pw, line->rot);
4175 printf(" ");
4176 for (i = 0; i < line->len; ++i) {
4177 printf("%c", line->text[i] & 0xff);
4178 }
4179 printf("\n");
4180 }
4181 }
4182 }
4183 }
4184
4185 #endif //~debug
4186
4187 //------------------------------------------------------------------------
4188 // TextOutputDev
4189 //------------------------------------------------------------------------
4190
// Output callback: write <len> bytes of <text> to <stream>, which
// must be a FILE *.
static void outputToFile(void *stream, const char *text, int len) {
  FILE *out = (FILE *)stream;
  fwrite(text, sizeof(char), len, out);
}
4194
TextOutputDev(char * fileName,TextOutputControl * controlA,GBool append)4195 TextOutputDev::TextOutputDev(char *fileName, TextOutputControl *controlA,
4196 GBool append) {
4197 text = NULL;
4198 control = *controlA;
4199 ok = gTrue;
4200
4201 // open file
4202 needClose = gFalse;
4203 if (fileName) {
4204 if (!strcmp(fileName, "-")) {
4205 outputStream = stdout;
4206 #ifdef WIN32
4207 // keep DOS from munging the end-of-line characters
4208 setmode(fileno(stdout), O_BINARY);
4209 #endif
4210 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
4211 needClose = gTrue;
4212 } else {
4213 error(errIO, -1, "Couldn't open text file '{0:s}'", fileName);
4214 ok = gFalse;
4215 return;
4216 }
4217 outputFunc = &outputToFile;
4218 } else {
4219 outputStream = NULL;
4220 }
4221
4222 // set up text object
4223 text = new TextPage(&control);
4224 }
4225
TextOutputDev(TextOutputFunc func,void * stream,TextOutputControl * controlA)4226 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
4227 TextOutputControl *controlA) {
4228 outputFunc = func;
4229 outputStream = stream;
4230 needClose = gFalse;
4231 control = *controlA;
4232 text = new TextPage(&control);
4233 ok = gTrue;
4234 }
4235
~TextOutputDev()4236 TextOutputDev::~TextOutputDev() {
4237 if (needClose) {
4238 fclose((FILE *)outputStream);
4239 }
4240 if (text) {
4241 delete text;
4242 }
4243 }
4244
startPage(int pageNum,GfxState * state)4245 void TextOutputDev::startPage(int pageNum, GfxState *state) {
4246 text->startPage(state);
4247 }
4248
endPage()4249 void TextOutputDev::endPage() {
4250 if (outputStream) {
4251 text->write(outputStream, outputFunc);
4252 }
4253 }
4254
restoreState(GfxState * state)4255 void TextOutputDev::restoreState(GfxState *state) {
4256 text->updateFont(state);
4257 }
4258
updateFont(GfxState * state)4259 void TextOutputDev::updateFont(GfxState *state) {
4260 text->updateFont(state);
4261 }
4262
beginString(GfxState * state,GString * s)4263 void TextOutputDev::beginString(GfxState *state, GString *s) {
4264 }
4265
endString(GfxState * state)4266 void TextOutputDev::endString(GfxState *state) {
4267 }
4268
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,Unicode * u,int uLen)4269 void TextOutputDev::drawChar(GfxState *state, double x, double y,
4270 double dx, double dy,
4271 double originX, double originY,
4272 CharCode c, int nBytes, Unicode *u, int uLen) {
4273 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
4274 }
4275
incCharCount(int nChars)4276 void TextOutputDev::incCharCount(int nChars) {
4277 text->incCharCount(nChars);
4278 }
4279
beginActualText(GfxState * state,Unicode * u,int uLen)4280 void TextOutputDev::beginActualText(GfxState *state, Unicode *u, int uLen) {
4281 text->beginActualText(state, u, uLen);
4282 }
4283
endActualText(GfxState * state)4284 void TextOutputDev::endActualText(GfxState *state) {
4285 text->endActualText(state);
4286 }
4287
stroke(GfxState * state)4288 void TextOutputDev::stroke(GfxState *state) {
4289 GfxPath *path;
4290 GfxSubpath *subpath;
4291 double x[2], y[2];
4292
4293 if (!control.html) {
4294 return;
4295 }
4296 path = state->getPath();
4297 if (path->getNumSubpaths() != 1) {
4298 return;
4299 }
4300 subpath = path->getSubpath(0);
4301 if (subpath->getNumPoints() != 2) {
4302 return;
4303 }
4304 state->transform(subpath->getX(0), subpath->getY(0), &x[0], &y[0]);
4305 state->transform(subpath->getX(1), subpath->getY(1), &x[1], &y[1]);
4306
4307 // look for a vertical or horizontal line
4308 if (x[0] == x[1] || y[0] == y[1]) {
4309 text->addUnderline(x[0], y[0], x[1], y[1]);
4310 }
4311 }
4312
fill(GfxState * state)4313 void TextOutputDev::fill(GfxState *state) {
4314 GfxPath *path;
4315 GfxSubpath *subpath;
4316 double x[5], y[5];
4317 double rx0, ry0, rx1, ry1, t;
4318 int i;
4319
4320 if (!control.html) {
4321 return;
4322 }
4323 path = state->getPath();
4324 if (path->getNumSubpaths() != 1) {
4325 return;
4326 }
4327 subpath = path->getSubpath(0);
4328 if (subpath->getNumPoints() != 5) {
4329 return;
4330 }
4331 for (i = 0; i < 5; ++i) {
4332 if (subpath->getCurve(i)) {
4333 return;
4334 }
4335 state->transform(subpath->getX(i), subpath->getY(i), &x[i], &y[i]);
4336 }
4337
4338 // look for a rectangle
4339 if (x[0] == x[1] && y[1] == y[2] && x[2] == x[3] && y[3] == y[4] &&
4340 x[0] == x[4] && y[0] == y[4]) {
4341 rx0 = x[0];
4342 ry0 = y[0];
4343 rx1 = x[2];
4344 ry1 = y[1];
4345 } else if (y[0] == y[1] && x[1] == x[2] && y[2] == y[3] && x[3] == x[4] &&
4346 x[0] == x[4] && y[0] == y[4]) {
4347 rx0 = x[0];
4348 ry0 = y[0];
4349 rx1 = x[1];
4350 ry1 = y[2];
4351 } else {
4352 return;
4353 }
4354 if (rx1 < rx0) {
4355 t = rx0;
4356 rx0 = rx1;
4357 rx1 = t;
4358 }
4359 if (ry1 < ry0) {
4360 t = ry0;
4361 ry0 = ry1;
4362 ry1 = t;
4363 }
4364
4365 // skinny horizontal rectangle
4366 if (ry1 - ry0 < rx1 - rx0) {
4367 if (ry1 - ry0 < maxUnderlineWidth) {
4368 ry0 = 0.5 * (ry0 + ry1);
4369 text->addUnderline(rx0, ry0, rx1, ry0);
4370 }
4371
4372 // skinny vertical rectangle
4373 } else {
4374 if (rx1 - rx0 < maxUnderlineWidth) {
4375 rx0 = 0.5 * (rx0 + rx1);
4376 text->addUnderline(rx0, ry0, rx0, ry1);
4377 }
4378 }
4379 }
4380
eoFill(GfxState * state)4381 void TextOutputDev::eoFill(GfxState *state) {
4382 if (!control.html) {
4383 return;
4384 }
4385 fill(state);
4386 }
4387
processLink(Link * link)4388 void TextOutputDev::processLink(Link *link) {
4389 double x1, y1, x2, y2;
4390 int xMin, yMin, xMax, yMax, x, y;
4391
4392 if (!control.html) {
4393 return;
4394 }
4395 link->getRect(&x1, &y1, &x2, &y2);
4396 cvtUserToDev(x1, y1, &x, &y);
4397 xMin = xMax = x;
4398 yMin = yMax = y;
4399 cvtUserToDev(x1, y2, &x, &y);
4400 if (x < xMin) {
4401 xMin = x;
4402 } else if (x > xMax) {
4403 xMax = x;
4404 }
4405 if (y < yMin) {
4406 yMin = y;
4407 } else if (y > yMax) {
4408 yMax = y;
4409 }
4410 cvtUserToDev(x2, y1, &x, &y);
4411 if (x < xMin) {
4412 xMin = x;
4413 } else if (x > xMax) {
4414 xMax = x;
4415 }
4416 if (y < yMin) {
4417 yMin = y;
4418 } else if (y > yMax) {
4419 yMax = y;
4420 }
4421 cvtUserToDev(x2, y2, &x, &y);
4422 if (x < xMin) {
4423 xMin = x;
4424 } else if (x > xMax) {
4425 xMax = x;
4426 }
4427 if (y < yMin) {
4428 yMin = y;
4429 } else if (y > yMax) {
4430 yMax = y;
4431 }
4432 text->addLink(xMin, yMin, xMax, yMax, link);
4433 }
4434
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,GBool wholeWord,double * xMin,double * yMin,double * xMax,double * yMax)4435 GBool TextOutputDev::findText(Unicode *s, int len,
4436 GBool startAtTop, GBool stopAtBottom,
4437 GBool startAtLast, GBool stopAtLast,
4438 GBool caseSensitive, GBool backward,
4439 GBool wholeWord,
4440 double *xMin, double *yMin,
4441 double *xMax, double *yMax) {
4442 return text->findText(s, len, startAtTop, stopAtBottom,
4443 startAtLast, stopAtLast,
4444 caseSensitive, backward, wholeWord,
4445 xMin, yMin, xMax, yMax);
4446 }
4447
getText(double xMin,double yMin,double xMax,double yMax)4448 GString *TextOutputDev::getText(double xMin, double yMin,
4449 double xMax, double yMax) {
4450 return text->getText(xMin, yMin, xMax, yMax);
4451 }
4452
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)4453 GBool TextOutputDev::findCharRange(int pos, int length,
4454 double *xMin, double *yMin,
4455 double *xMax, double *yMax) {
4456 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
4457 }
4458
makeWordList()4459 TextWordList *TextOutputDev::makeWordList() {
4460 return text->makeWordList();
4461 }
4462
takeText()4463 TextPage *TextOutputDev::takeText() {
4464 TextPage *ret;
4465
4466 ret = text;
4467 text = new TextPage(&control);
4468 return ret;
4469 }
4470