1 //========================================================================
2 //
3 // TextOutputDev.cc
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8
9 #include <aconf.h>
10
11 #ifdef USE_GCC_PRAGMAS
12 #pragma implementation
13 #endif
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <stddef.h>
18 #include <math.h>
19 #include <ctype.h>
20 #ifdef WIN32
21 #include <fcntl.h> // for O_BINARY
22 #include <io.h> // for setmode
23 #endif
24 #include "gmem.h"
25 #include "GString.h"
26 #include "GList.h"
27 #include "config.h"
28 #include "Error.h"
29 #include "GlobalParams.h"
30 #include "UnicodeMap.h"
31 #include "UnicodeTypeTable.h"
32 #include "GfxState.h"
33 #include "TextOutputDev.h"
34
35 #ifdef MACOS
36 // needed for setting type/creator of MacOS files
37 #include "ICSupport.h"
38 #endif
39
40 //------------------------------------------------------------------------
41 // parameters
42 //------------------------------------------------------------------------
43
// Size, in points, of the baseline range covered by a single bucket
// of a text pool.
#define textPoolStep 4

// Width of an inter-character space that makes addChar begin a new
// word.
#define minWordBreakSpace 0.1

// Width of a negative inter-character space (an overlap) that makes
// addChar begin a new word.
#define minDupBreakOverlap 0.2

// Largest baseline-to-baseline distance allowed between two lines of
// the same block, expressed as a fraction of the font size.
#define maxLineSpacingDelta 1.5

// Largest allowed difference between the primary font sizes of two
// lines in one block:
//   delta1 - for new lines above and below the current block
//   delta2 - for text overlapping the current block
//   delta3 - for text to the left and right of the current block
#define maxBlockFontSizeDelta1 0.05
#define maxBlockFontSizeDelta2 0.6
#define maxBlockFontSizeDelta3 0.2

// Largest allowed font size difference within one word.
#define maxWordFontSizeDelta 0.05

// Largest baseline distance between two words on the same line (for
// example between a subscript or superscript and the primary
// baseline), as a fraction of the font size.
#define maxIntraLineDelta 0.5

// Smallest inter-word spacing, as a fraction of the font size.  (Raw
// ordering only.)
#define minWordSpacing 0.15

// Largest inter-word spacing, as a fraction of the font size.
#define maxWordSpacing 1.5

// Maximum horizontal spacing at which a word may still be pulled
// into a block.
#define minColSpacing1 0.3

// Smallest spacing between columns, as a fraction of the font size.
#define minColSpacing2 1.0

// Largest vertical spacing between blocks of one flow, as a multiple
// of the font size.
#define maxBlockSpacing 2.5

// Smallest spacing between characters of one word, as a fraction of
// the font size.
#define minCharSpacing -0.2

// Largest spacing between characters of one word, as a fraction of
// the font size, for text without obvious extra-wide character
// spacing.
#define maxCharSpacing 0.03

// For text with extra-wide character spacing, the inter-character
// space threshold is the minimum inter-character space times this
// factor.
#define maxWideCharSpacingMul 1.3

// Largest primary / secondary coordinate difference (as a fraction
// of the font size) for duplicated text (fake boldface, drop
// shadows) that is to be discarded.
#define dupMaxPriDelta 0.1
#define dupMaxSecDelta 0.2
114
115 //------------------------------------------------------------------------
116 // TextFontInfo
117 //------------------------------------------------------------------------
118
TextFontInfo(GfxState * state)119 TextFontInfo::TextFontInfo(GfxState *state) {
120 gfxFont = state->getFont();
121 #if TEXTOUT_WORD_LIST
122 fontName = (gfxFont && gfxFont->getOrigName())
123 ? gfxFont->getOrigName()->copy()
124 : (GString *)NULL;
125 #endif
126 }
127
~TextFontInfo()128 TextFontInfo::~TextFontInfo() {
129 #if TEXTOUT_WORD_LIST
130 if (fontName) {
131 delete fontName;
132 }
133 #endif
134 }
135
matches(GfxState * state)136 GBool TextFontInfo::matches(GfxState *state) {
137 return state->getFont() == gfxFont;
138 }
139
140 //------------------------------------------------------------------------
141 // TextWord
142 //------------------------------------------------------------------------
143
TextWord(GfxState * state,int rotA,double x0,double y0,int charPosA,TextFontInfo * fontA,double fontSizeA)144 TextWord::TextWord(GfxState *state, int rotA, double x0, double y0,
145 int charPosA, TextFontInfo *fontA, double fontSizeA) {
146 GfxFont *gfxFont;
147 double x, y, ascent, descent;
148
149 rot = rotA;
150 charPos = charPosA;
151 charLen = 0;
152 font = fontA;
153 fontSize = fontSizeA;
154 state->transform(x0, y0, &x, &y);
155 if ((gfxFont = font->gfxFont)) {
156 ascent = gfxFont->getAscent() * fontSize;
157 descent = gfxFont->getDescent() * fontSize;
158 } else {
159 // this means that the PDF file draws text without a current font,
160 // which should never happen
161 ascent = 0.95 * fontSize;
162 descent = -0.35 * fontSize;
163 }
164 switch (rot) {
165 case 0:
166 yMin = y - ascent;
167 yMax = y - descent;
168 if (yMin == yMax) {
169 // this is a sanity check for a case that shouldn't happen -- but
170 // if it does happen, we want to avoid dividing by zero later
171 yMin = y;
172 yMax = y + 1;
173 }
174 base = y;
175 break;
176 case 1:
177 xMin = x + descent;
178 xMax = x + ascent;
179 if (xMin == xMax) {
180 // this is a sanity check for a case that shouldn't happen -- but
181 // if it does happen, we want to avoid dividing by zero later
182 xMin = x;
183 xMax = x + 1;
184 }
185 base = x;
186 break;
187 case 2:
188 yMin = y + descent;
189 yMax = y + ascent;
190 if (yMin == yMax) {
191 // this is a sanity check for a case that shouldn't happen -- but
192 // if it does happen, we want to avoid dividing by zero later
193 yMin = y;
194 yMax = y + 1;
195 }
196 base = y;
197 break;
198 case 3:
199 xMin = x - ascent;
200 xMax = x - descent;
201 if (xMin == xMax) {
202 // this is a sanity check for a case that shouldn't happen -- but
203 // if it does happen, we want to avoid dividing by zero later
204 xMin = x;
205 xMax = x + 1;
206 }
207 base = x;
208 break;
209 }
210 text = NULL;
211 edge = NULL;
212 len = size = 0;
213 spaceAfter = gFalse;
214 next = NULL;
215
216 #if TEXTOUT_WORD_LIST
217 GfxRGB rgb;
218
219 if ((state->getRender() & 3) == 1) {
220 state->getStrokeRGB(&rgb);
221 } else {
222 state->getFillRGB(&rgb);
223 }
224 colorR = colToDbl(rgb.r);
225 colorG = colToDbl(rgb.g);
226 colorB = colToDbl(rgb.b);
227 #endif
228 }
229
~TextWord()230 TextWord::~TextWord() {
231 gfree(text);
232 gfree(edge);
233 }
234
addChar(GfxState * state,double x,double y,double dx,double dy,Unicode u)235 void TextWord::addChar(GfxState *state, double x, double y,
236 double dx, double dy, Unicode u) {
237 if (len == size) {
238 size += 16;
239 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
240 edge = (double *)greallocn(edge, size + 1, sizeof(double));
241 }
242 text[len] = u;
243 switch (rot) {
244 case 0:
245 if (len == 0) {
246 xMin = x;
247 }
248 edge[len] = x;
249 xMax = edge[len+1] = x + dx;
250 break;
251 case 1:
252 if (len == 0) {
253 yMin = y;
254 }
255 edge[len] = y;
256 yMax = edge[len+1] = y + dy;
257 break;
258 case 2:
259 if (len == 0) {
260 xMax = x;
261 }
262 edge[len] = x;
263 xMin = edge[len+1] = x + dx;
264 break;
265 case 3:
266 if (len == 0) {
267 yMax = y;
268 }
269 edge[len] = y;
270 yMin = edge[len+1] = y + dy;
271 break;
272 }
273 ++len;
274 }
275
merge(TextWord * word)276 void TextWord::merge(TextWord *word) {
277 int i;
278
279 if (word->xMin < xMin) {
280 xMin = word->xMin;
281 }
282 if (word->yMin < yMin) {
283 yMin = word->yMin;
284 }
285 if (word->xMax > xMax) {
286 xMax = word->xMax;
287 }
288 if (word->yMax > yMax) {
289 yMax = word->yMax;
290 }
291 if (len + word->len > size) {
292 size = len + word->len;
293 text = (Unicode *)greallocn(text, size, sizeof(Unicode));
294 edge = (double *)greallocn(edge, size + 1, sizeof(double));
295 }
296 for (i = 0; i < word->len; ++i) {
297 text[len + i] = word->text[i];
298 edge[len + i] = word->edge[i];
299 }
300 edge[len + word->len] = word->edge[word->len];
301 len += word->len;
302 charLen += word->charLen;
303 }
304
primaryCmp(TextWord * word)305 inline int TextWord::primaryCmp(TextWord *word) {
306 double cmp;
307
308 cmp = 0; // make gcc happy
309 switch (rot) {
310 case 0:
311 cmp = xMin - word->xMin;
312 break;
313 case 1:
314 cmp = yMin - word->yMin;
315 break;
316 case 2:
317 cmp = word->xMax - xMax;
318 break;
319 case 3:
320 cmp = word->yMax - yMax;
321 break;
322 }
323 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
324 }
325
primaryDelta(TextWord * word)326 double TextWord::primaryDelta(TextWord *word) {
327 double delta;
328
329 delta = 0; // make gcc happy
330 switch (rot) {
331 case 0:
332 delta = word->xMin - xMax;
333 break;
334 case 1:
335 delta = word->yMin - yMax;
336 break;
337 case 2:
338 delta = xMin - word->xMax;
339 break;
340 case 3:
341 delta = yMin - word->yMax;
342 break;
343 }
344 return delta;
345 }
346
cmpYX(const void * p1,const void * p2)347 int TextWord::cmpYX(const void *p1, const void *p2) {
348 TextWord *word1 = *(TextWord **)p1;
349 TextWord *word2 = *(TextWord **)p2;
350 double cmp;
351
352 cmp = word1->yMin - word2->yMin;
353 if (cmp == 0) {
354 cmp = word1->xMin - word2->xMin;
355 }
356 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
357 }
358
359 #if TEXTOUT_WORD_LIST
360
getText()361 GString *TextWord::getText() {
362 GString *s;
363 UnicodeMap *uMap;
364 char buf[8];
365 int n, i;
366
367 s = new GString();
368 if (!(uMap = globalParams->getTextEncoding())) {
369 return s;
370 }
371 for (i = 0; i < len; ++i) {
372 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
373 s->append(buf, n);
374 }
375 uMap->decRefCnt();
376 return s;
377 }
378
379 #endif // TEXTOUT_WORD_LIST
380
381 //------------------------------------------------------------------------
382 // TextPool
383 //------------------------------------------------------------------------
384
TextPool()385 TextPool::TextPool() {
386 minBaseIdx = 0;
387 maxBaseIdx = -1;
388 pool = NULL;
389 cursor = NULL;
390 cursorBaseIdx = -1;
391 }
392
~TextPool()393 TextPool::~TextPool() {
394 int baseIdx;
395 TextWord *word, *word2;
396
397 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
398 for (word = pool[baseIdx - minBaseIdx]; word; word = word2) {
399 word2 = word->next;
400 delete word;
401 }
402 }
403 gfree(pool);
404 }
405
getBaseIdx(double base)406 int TextPool::getBaseIdx(double base) {
407 int baseIdx;
408
409 baseIdx = (int)(base / textPoolStep);
410 if (baseIdx < minBaseIdx) {
411 return minBaseIdx;
412 }
413 if (baseIdx > maxBaseIdx) {
414 return maxBaseIdx;
415 }
416 return baseIdx;
417 }
418
addWord(TextWord * word)419 void TextPool::addWord(TextWord *word) {
420 TextWord **newPool;
421 int wordBaseIdx, newMinBaseIdx, newMaxBaseIdx, baseIdx;
422 TextWord *w0, *w1;
423
424 // expand the array if needed
425 wordBaseIdx = (int)(word->base / textPoolStep);
426 if (minBaseIdx > maxBaseIdx) {
427 minBaseIdx = wordBaseIdx - 128;
428 maxBaseIdx = wordBaseIdx + 128;
429 pool = (TextWord **)gmallocn(maxBaseIdx - minBaseIdx + 1,
430 sizeof(TextWord *));
431 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
432 pool[baseIdx - minBaseIdx] = NULL;
433 }
434 } else if (wordBaseIdx < minBaseIdx) {
435 newMinBaseIdx = wordBaseIdx - 128;
436 newPool = (TextWord **)gmallocn(maxBaseIdx - newMinBaseIdx + 1,
437 sizeof(TextWord *));
438 for (baseIdx = newMinBaseIdx; baseIdx < minBaseIdx; ++baseIdx) {
439 newPool[baseIdx - newMinBaseIdx] = NULL;
440 }
441 memcpy(&newPool[minBaseIdx - newMinBaseIdx], pool,
442 (maxBaseIdx - minBaseIdx + 1) * sizeof(TextWord *));
443 gfree(pool);
444 pool = newPool;
445 minBaseIdx = newMinBaseIdx;
446 } else if (wordBaseIdx > maxBaseIdx) {
447 newMaxBaseIdx = wordBaseIdx + 128;
448 pool = (TextWord **)greallocn(pool, newMaxBaseIdx - minBaseIdx + 1,
449 sizeof(TextWord *));
450 for (baseIdx = maxBaseIdx + 1; baseIdx <= newMaxBaseIdx; ++baseIdx) {
451 pool[baseIdx - minBaseIdx] = NULL;
452 }
453 maxBaseIdx = newMaxBaseIdx;
454 }
455
456 // insert the new word
457 if (cursor && wordBaseIdx == cursorBaseIdx &&
458 word->primaryCmp(cursor) > 0) {
459 w0 = cursor;
460 w1 = cursor->next;
461 } else {
462 w0 = NULL;
463 w1 = pool[wordBaseIdx - minBaseIdx];
464 }
465 for (; w1 && word->primaryCmp(w1) > 0; w0 = w1, w1 = w1->next) ;
466 word->next = w1;
467 if (w0) {
468 w0->next = word;
469 } else {
470 pool[wordBaseIdx - minBaseIdx] = word;
471 }
472 cursor = word;
473 cursorBaseIdx = wordBaseIdx;
474 }
475
476 //------------------------------------------------------------------------
477 // TextLine
478 //------------------------------------------------------------------------
479
TextLine(TextBlock * blkA,int rotA,double baseA)480 TextLine::TextLine(TextBlock *blkA, int rotA, double baseA) {
481 blk = blkA;
482 rot = rotA;
483 xMin = yMin = 0;
484 xMax = yMax = -1;
485 base = baseA;
486 words = lastWord = NULL;
487 text = NULL;
488 edge = NULL;
489 col = NULL;
490 len = 0;
491 convertedLen = 0;
492 hyphenated = gFalse;
493 next = NULL;
494 }
495
~TextLine()496 TextLine::~TextLine() {
497 TextWord *word;
498
499 while (words) {
500 word = words;
501 words = words->next;
502 delete word;
503 }
504 gfree(text);
505 gfree(edge);
506 gfree(col);
507 }
508
addWord(TextWord * word)509 void TextLine::addWord(TextWord *word) {
510 if (lastWord) {
511 lastWord->next = word;
512 } else {
513 words = word;
514 }
515 lastWord = word;
516
517 if (xMin > xMax) {
518 xMin = word->xMin;
519 xMax = word->xMax;
520 yMin = word->yMin;
521 yMax = word->yMax;
522 } else {
523 if (word->xMin < xMin) {
524 xMin = word->xMin;
525 }
526 if (word->xMax > xMax) {
527 xMax = word->xMax;
528 }
529 if (word->yMin < yMin) {
530 yMin = word->yMin;
531 }
532 if (word->yMax > yMax) {
533 yMax = word->yMax;
534 }
535 }
536 }
537
primaryDelta(TextLine * line)538 double TextLine::primaryDelta(TextLine *line) {
539 double delta;
540
541 delta = 0; // make gcc happy
542 switch (rot) {
543 case 0:
544 delta = line->xMin - xMax;
545 break;
546 case 1:
547 delta = line->yMin - yMax;
548 break;
549 case 2:
550 delta = xMin - line->xMax;
551 break;
552 case 3:
553 delta = yMin - line->yMax;
554 break;
555 }
556 return delta;
557 }
558
primaryCmp(TextLine * line)559 int TextLine::primaryCmp(TextLine *line) {
560 double cmp;
561
562 cmp = 0; // make gcc happy
563 switch (rot) {
564 case 0:
565 cmp = xMin - line->xMin;
566 break;
567 case 1:
568 cmp = yMin - line->yMin;
569 break;
570 case 2:
571 cmp = line->xMax - xMax;
572 break;
573 case 3:
574 cmp = line->yMax - yMax;
575 break;
576 }
577 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
578 }
579
secondaryCmp(TextLine * line)580 int TextLine::secondaryCmp(TextLine *line) {
581 double cmp;
582
583 cmp = (rot == 0 || rot == 3) ? base - line->base : line->base - base;
584 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
585 }
586
cmpYX(TextLine * line)587 int TextLine::cmpYX(TextLine *line) {
588 int cmp;
589
590 if ((cmp = secondaryCmp(line))) {
591 return cmp;
592 }
593 return primaryCmp(line);
594 }
595
cmpXY(const void * p1,const void * p2)596 int TextLine::cmpXY(const void *p1, const void *p2) {
597 TextLine *line1 = *(TextLine **)p1;
598 TextLine *line2 = *(TextLine **)p2;
599 int cmp;
600
601 if ((cmp = line1->primaryCmp(line2))) {
602 return cmp;
603 }
604 return line1->secondaryCmp(line2);
605 }
606
coalesce(UnicodeMap * uMap)607 void TextLine::coalesce(UnicodeMap *uMap) {
608 TextWord *word0, *word1;
609 double space, delta, minSpace;
610 GBool isUnicode;
611 char buf[8];
612 int i, j;
613
614 if (words->next) {
615
616 // compute the inter-word space threshold
617 if (words->len > 1 || words->next->len > 1) {
618 minSpace = 0;
619 } else {
620 minSpace = words->primaryDelta(words->next);
621 for (word0 = words->next, word1 = word0->next;
622 word1 && minSpace > 0;
623 word0 = word1, word1 = word0->next) {
624 if (word1->len > 1) {
625 minSpace = 0;
626 }
627 delta = word0->primaryDelta(word1);
628 if (delta < minSpace) {
629 minSpace = delta;
630 }
631 }
632 }
633 if (minSpace <= 0) {
634 space = maxCharSpacing * words->fontSize;
635 } else {
636 space = maxWideCharSpacingMul * minSpace;
637 }
638
639 // merge words
640 word0 = words;
641 word1 = words->next;
642 while (word1) {
643 if (word0->primaryDelta(word1) >= space) {
644 word0->spaceAfter = gTrue;
645 word0 = word1;
646 word1 = word1->next;
647 } else if (word0->font == word1->font &&
648 fabs(word0->fontSize - word1->fontSize) <
649 maxWordFontSizeDelta * words->fontSize &&
650 word1->charPos == word0->charPos + word0->charLen) {
651 word0->merge(word1);
652 word0->next = word1->next;
653 delete word1;
654 word1 = word0->next;
655 } else {
656 word0 = word1;
657 word1 = word1->next;
658 }
659 }
660 }
661
662 // build the line text
663 isUnicode = uMap ? uMap->isUnicode() : gFalse;
664 len = 0;
665 for (word1 = words; word1; word1 = word1->next) {
666 len += word1->len;
667 if (word1->spaceAfter) {
668 ++len;
669 }
670 }
671 text = (Unicode *)gmallocn(len, sizeof(Unicode));
672 edge = (double *)gmallocn(len + 1, sizeof(double));
673 i = 0;
674 for (word1 = words; word1; word1 = word1->next) {
675 for (j = 0; j < word1->len; ++j) {
676 text[i] = word1->text[j];
677 edge[i] = word1->edge[j];
678 ++i;
679 }
680 edge[i] = word1->edge[word1->len];
681 if (word1->spaceAfter) {
682 text[i] = (Unicode)0x0020;
683 ++i;
684 }
685 }
686
687 // compute convertedLen and set up the col array
688 col = (int *)gmallocn(len + 1, sizeof(int));
689 convertedLen = 0;
690 for (i = 0; i < len; ++i) {
691 col[i] = convertedLen;
692 if (isUnicode) {
693 ++convertedLen;
694 } else if (uMap) {
695 convertedLen += uMap->mapUnicode(text[i], buf, sizeof(buf));
696 }
697 }
698 col[len] = convertedLen;
699
700 // check for hyphen at end of line
701 //~ need to check for other chars used as hyphens
702 hyphenated = text[len - 1] == (Unicode)'-';
703 }
704
705 //------------------------------------------------------------------------
706 // TextLineFrag
707 //------------------------------------------------------------------------
708
709 class TextLineFrag {
710 public:
711
712 TextLine *line; // the line object
713 int start, len; // offset and length of this fragment
714 // (in Unicode chars)
715 double xMin, xMax; // bounding box coordinates
716 double yMin, yMax;
717 double base; // baseline virtual coordinate
718 int col; // first column
719
720 void init(TextLine *lineA, int startA, int lenA);
721 void computeCoords(GBool oneRot);
722
723 static int cmpYXPrimaryRot(const void *p1, const void *p2);
724 static int cmpYXLineRot(const void *p1, const void *p2);
725 static int cmpXYLineRot(const void *p1, const void *p2);
726 };
727
init(TextLine * lineA,int startA,int lenA)728 void TextLineFrag::init(TextLine *lineA, int startA, int lenA) {
729 line = lineA;
730 start = startA;
731 len = lenA;
732 col = line->col[start];
733 }
734
computeCoords(GBool oneRot)735 void TextLineFrag::computeCoords(GBool oneRot) {
736 TextBlock *blk;
737 double d0, d1, d2, d3, d4;
738
739 if (oneRot) {
740
741 switch (line->rot) {
742 case 0:
743 xMin = line->edge[start];
744 xMax = line->edge[start + len];
745 yMin = line->yMin;
746 yMax = line->yMax;
747 break;
748 case 1:
749 xMin = line->xMin;
750 xMax = line->xMax;
751 yMin = line->edge[start];
752 yMax = line->edge[start + len];
753 break;
754 case 2:
755 xMin = line->edge[start + len];
756 xMax = line->edge[start];
757 yMin = line->yMin;
758 yMax = line->yMax;
759 break;
760 case 3:
761 xMin = line->xMin;
762 xMax = line->xMax;
763 yMin = line->edge[start + len];
764 yMax = line->edge[start];
765 break;
766 }
767 base = line->base;
768
769 } else {
770
771 if (line->rot == 0 && line->blk->page->primaryRot == 0) {
772
773 xMin = line->edge[start];
774 xMax = line->edge[start + len];
775 yMin = line->yMin;
776 yMax = line->yMax;
777 base = line->base;
778
779 } else {
780
781 blk = line->blk;
782 d0 = line->edge[start];
783 d1 = line->edge[start + len];
784 d2 = d3 = d4 = 0; // make gcc happy
785
786 switch (line->rot) {
787 case 0:
788 d2 = line->yMin;
789 d3 = line->yMax;
790 d4 = line->base;
791 d0 = (d0 - blk->xMin) / (blk->xMax - blk->xMin);
792 d1 = (d1 - blk->xMin) / (blk->xMax - blk->xMin);
793 d2 = (d2 - blk->yMin) / (blk->yMax - blk->yMin);
794 d3 = (d3 - blk->yMin) / (blk->yMax - blk->yMin);
795 d4 = (d4 - blk->yMin) / (blk->yMax - blk->yMin);
796 break;
797 case 1:
798 d2 = line->xMax;
799 d3 = line->xMin;
800 d4 = line->base;
801 d0 = (d0 - blk->yMin) / (blk->yMax - blk->yMin);
802 d1 = (d1 - blk->yMin) / (blk->yMax - blk->yMin);
803 d2 = (blk->xMax - d2) / (blk->xMax - blk->xMin);
804 d3 = (blk->xMax - d3) / (blk->xMax - blk->xMin);
805 d4 = (blk->xMax - d4) / (blk->xMax - blk->xMin);
806 break;
807 case 2:
808 d2 = line->yMax;
809 d3 = line->yMin;
810 d4 = line->base;
811 d0 = (blk->xMax - d0) / (blk->xMax - blk->xMin);
812 d1 = (blk->xMax - d1) / (blk->xMax - blk->xMin);
813 d2 = (blk->yMax - d2) / (blk->yMax - blk->yMin);
814 d3 = (blk->yMax - d3) / (blk->yMax - blk->yMin);
815 d4 = (blk->yMax - d4) / (blk->yMax - blk->yMin);
816 break;
817 case 3:
818 d2 = line->xMin;
819 d3 = line->xMax;
820 d4 = line->base;
821 d0 = (blk->yMax - d0) / (blk->yMax - blk->yMin);
822 d1 = (blk->yMax - d1) / (blk->yMax - blk->yMin);
823 d2 = (d2 - blk->xMin) / (blk->xMax - blk->xMin);
824 d3 = (d3 - blk->xMin) / (blk->xMax - blk->xMin);
825 d4 = (d4 - blk->xMin) / (blk->xMax - blk->xMin);
826 break;
827 }
828
829 switch (line->blk->page->primaryRot) {
830 case 0:
831 xMin = blk->xMin + d0 * (blk->xMax - blk->xMin);
832 xMax = blk->xMin + d1 * (blk->xMax - blk->xMin);
833 yMin = blk->yMin + d2 * (blk->yMax - blk->yMin);
834 yMax = blk->yMin + d3 * (blk->yMax - blk->yMin);
835 base = blk->yMin + base * (blk->yMax - blk->yMin);
836 break;
837 case 1:
838 xMin = blk->xMax - d3 * (blk->xMax - blk->xMin);
839 xMax = blk->xMax - d2 * (blk->xMax - blk->xMin);
840 yMin = blk->yMin + d0 * (blk->yMax - blk->yMin);
841 yMax = blk->yMin + d1 * (blk->yMax - blk->yMin);
842 base = blk->xMax - d4 * (blk->xMax - blk->xMin);
843 break;
844 case 2:
845 xMin = blk->xMax - d1 * (blk->xMax - blk->xMin);
846 xMax = blk->xMax - d0 * (blk->xMax - blk->xMin);
847 yMin = blk->yMax - d3 * (blk->yMax - blk->yMin);
848 yMax = blk->yMax - d2 * (blk->yMax - blk->yMin);
849 base = blk->yMax - d4 * (blk->yMax - blk->yMin);
850 break;
851 case 3:
852 xMin = blk->xMin + d2 * (blk->xMax - blk->xMin);
853 xMax = blk->xMin + d3 * (blk->xMax - blk->xMin);
854 yMin = blk->yMax - d1 * (blk->yMax - blk->yMin);
855 yMax = blk->yMax - d0 * (blk->yMax - blk->yMin);
856 base = blk->xMin + d4 * (blk->xMax - blk->xMin);
857 break;
858 }
859
860 }
861 }
862 }
863
cmpYXPrimaryRot(const void * p1,const void * p2)864 int TextLineFrag::cmpYXPrimaryRot(const void *p1, const void *p2) {
865 TextLineFrag *frag1 = (TextLineFrag *)p1;
866 TextLineFrag *frag2 = (TextLineFrag *)p2;
867 double cmp;
868
869 cmp = 0; // make gcc happy
870 switch (frag1->line->blk->page->primaryRot) {
871 case 0:
872 if (fabs(cmp = frag1->yMin - frag2->yMin) < 0.01) {
873 cmp = frag1->xMin - frag2->xMin;
874 }
875 break;
876 case 1:
877 if (fabs(cmp = frag2->xMax - frag1->xMax) < 0.01) {
878 cmp = frag1->yMin - frag2->yMin;
879 }
880 break;
881 case 2:
882 if (fabs(cmp = frag2->yMin - frag1->yMin) < 0.01) {
883 cmp = frag2->xMax - frag1->xMax;
884 }
885 break;
886 case 3:
887 if (fabs(cmp = frag1->xMax - frag2->xMax) < 0.01) {
888 cmp = frag2->yMax - frag1->yMax;
889 }
890 break;
891 }
892 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
893 }
894
cmpYXLineRot(const void * p1,const void * p2)895 int TextLineFrag::cmpYXLineRot(const void *p1, const void *p2) {
896 TextLineFrag *frag1 = (TextLineFrag *)p1;
897 TextLineFrag *frag2 = (TextLineFrag *)p2;
898 double cmp;
899
900 cmp = 0; // make gcc happy
901 switch (frag1->line->rot) {
902 case 0:
903 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
904 cmp = frag1->xMin - frag2->xMin;
905 }
906 break;
907 case 1:
908 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
909 cmp = frag1->yMin - frag2->yMin;
910 }
911 break;
912 case 2:
913 if ((cmp = frag2->yMin - frag1->yMin) == 0) {
914 cmp = frag2->xMax - frag1->xMax;
915 }
916 break;
917 case 3:
918 if ((cmp = frag1->xMax - frag2->xMax) == 0) {
919 cmp = frag2->yMax - frag1->yMax;
920 }
921 break;
922 }
923 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
924 }
925
cmpXYLineRot(const void * p1,const void * p2)926 int TextLineFrag::cmpXYLineRot(const void *p1, const void *p2) {
927 TextLineFrag *frag1 = (TextLineFrag *)p1;
928 TextLineFrag *frag2 = (TextLineFrag *)p2;
929 double cmp;
930
931 cmp = 0; // make gcc happy
932 switch (frag1->line->rot) {
933 case 0:
934 if ((cmp = frag1->xMin - frag2->xMin) == 0) {
935 cmp = frag1->yMin - frag2->yMin;
936 }
937 break;
938 case 1:
939 if ((cmp = frag1->yMin - frag2->yMin) == 0) {
940 cmp = frag2->xMax - frag1->xMax;
941 }
942 break;
943 case 2:
944 if ((cmp = frag2->xMax - frag1->xMax) == 0) {
945 cmp = frag2->yMin - frag1->yMin;
946 }
947 break;
948 case 3:
949 if ((cmp = frag2->yMax - frag1->yMax) == 0) {
950 cmp = frag1->xMax - frag2->xMax;
951 }
952 break;
953 }
954 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
955 }
956
957 //------------------------------------------------------------------------
958 // TextBlock
959 //------------------------------------------------------------------------
960
TextBlock(TextPage * pageA,int rotA)961 TextBlock::TextBlock(TextPage *pageA, int rotA) {
962 page = pageA;
963 rot = rotA;
964 xMin = yMin = 0;
965 xMax = yMax = -1;
966 priMin = 0;
967 priMax = page->pageWidth;
968 pool = new TextPool();
969 lines = NULL;
970 curLine = NULL;
971 next = NULL;
972 stackNext = NULL;
973 }
974
~TextBlock()975 TextBlock::~TextBlock() {
976 TextLine *line;
977
978 delete pool;
979 while (lines) {
980 line = lines;
981 lines = lines->next;
982 delete line;
983 }
984 }
985
addWord(TextWord * word)986 void TextBlock::addWord(TextWord *word) {
987 pool->addWord(word);
988 if (xMin > xMax) {
989 xMin = word->xMin;
990 xMax = word->xMax;
991 yMin = word->yMin;
992 yMax = word->yMax;
993 } else {
994 if (word->xMin < xMin) {
995 xMin = word->xMin;
996 }
997 if (word->xMax > xMax) {
998 xMax = word->xMax;
999 }
1000 if (word->yMin < yMin) {
1001 yMin = word->yMin;
1002 }
1003 if (word->yMax > yMax) {
1004 yMax = word->yMax;
1005 }
1006 }
1007 }
1008
coalesce(UnicodeMap * uMap)1009 void TextBlock::coalesce(UnicodeMap *uMap) {
1010 TextWord *word0, *word1, *word2, *bestWord0, *bestWord1, *lastWord;
1011 TextLine *line, *line0, *line1;
1012 int poolMinBaseIdx, startBaseIdx, minBaseIdx, maxBaseIdx;
1013 int baseIdx, bestWordBaseIdx, idx0, idx1;
1014 double minBase, maxBase;
1015 double fontSize, delta, priDelta, secDelta;
1016 TextLine **lineArray;
1017 GBool found;
1018 int col1, col2;
1019 int i, j, k;
1020
1021 // discard duplicated text (fake boldface, drop shadows)
1022 for (idx0 = pool->minBaseIdx; idx0 <= pool->maxBaseIdx; ++idx0) {
1023 word0 = pool->getPool(idx0);
1024 while (word0) {
1025 priDelta = dupMaxPriDelta * word0->fontSize;
1026 secDelta = dupMaxSecDelta * word0->fontSize;
1027 if (rot == 0 || rot == 3) {
1028 maxBaseIdx = pool->getBaseIdx(word0->base + secDelta);
1029 } else {
1030 maxBaseIdx = pool->getBaseIdx(word0->base - secDelta);
1031 }
1032 found = gFalse;
1033 word1 = word2 = NULL; // make gcc happy
1034 for (idx1 = idx0; idx1 <= maxBaseIdx; ++idx1) {
1035 if (idx1 == idx0) {
1036 word1 = word0;
1037 word2 = word0->next;
1038 } else {
1039 word1 = NULL;
1040 word2 = pool->getPool(idx1);
1041 }
1042 for (; word2; word1 = word2, word2 = word2->next) {
1043 if (word2->len == word0->len &&
1044 !memcmp(word2->text, word0->text,
1045 word0->len * sizeof(Unicode))) {
1046 switch (rot) {
1047 case 0:
1048 case 2:
1049 found = fabs(word0->xMin - word2->xMin) < priDelta &&
1050 fabs(word0->xMax - word2->xMax) < priDelta &&
1051 fabs(word0->yMin - word2->yMin) < secDelta &&
1052 fabs(word0->yMax - word2->yMax) < secDelta;
1053 break;
1054 case 1:
1055 case 3:
1056 found = fabs(word0->xMin - word2->xMin) < secDelta &&
1057 fabs(word0->xMax - word2->xMax) < secDelta &&
1058 fabs(word0->yMin - word2->yMin) < priDelta &&
1059 fabs(word0->yMax - word2->yMax) < priDelta;
1060 break;
1061 }
1062 }
1063 if (found) {
1064 break;
1065 }
1066 }
1067 if (found) {
1068 break;
1069 }
1070 }
1071 if (found) {
1072 if (word1) {
1073 word1->next = word2->next;
1074 } else {
1075 pool->setPool(idx1, word2->next);
1076 }
1077 delete word2;
1078 } else {
1079 word0 = word0->next;
1080 }
1081 }
1082 }
1083
1084 // build the lines
1085 curLine = NULL;
1086 poolMinBaseIdx = pool->minBaseIdx;
1087 charCount = 0;
1088 nLines = 0;
1089 while (1) {
1090
1091 // find the first non-empty line in the pool
1092 for (;
1093 poolMinBaseIdx <= pool->maxBaseIdx && !pool->getPool(poolMinBaseIdx);
1094 ++poolMinBaseIdx) ;
1095 if (poolMinBaseIdx > pool->maxBaseIdx) {
1096 break;
1097 }
1098
1099 // look for the left-most word in the first four lines of the
1100 // pool -- this avoids starting with a superscript word
1101 startBaseIdx = poolMinBaseIdx;
1102 for (baseIdx = poolMinBaseIdx + 1;
1103 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
1104 ++baseIdx) {
1105 if (!pool->getPool(baseIdx)) {
1106 continue;
1107 }
1108 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
1109 < 0) {
1110 startBaseIdx = baseIdx;
1111 }
1112 }
1113
1114 // create a new line
1115 word0 = pool->getPool(startBaseIdx);
1116 pool->setPool(startBaseIdx, word0->next);
1117 word0->next = NULL;
1118 line = new TextLine(this, word0->rot, word0->base);
1119 line->addWord(word0);
1120 lastWord = word0;
1121
1122 // compute the search range
1123 fontSize = word0->fontSize;
1124 minBase = word0->base - maxIntraLineDelta * fontSize;
1125 maxBase = word0->base + maxIntraLineDelta * fontSize;
1126 minBaseIdx = pool->getBaseIdx(minBase);
1127 maxBaseIdx = pool->getBaseIdx(maxBase);
1128
1129 // find the rest of the words in this line
1130 while (1) {
1131
1132 // find the left-most word whose baseline is in the range for
1133 // this line
1134 bestWordBaseIdx = 0;
1135 bestWord0 = bestWord1 = NULL;
1136 for (baseIdx = minBaseIdx; baseIdx <= maxBaseIdx; ++baseIdx) {
1137 for (word0 = NULL, word1 = pool->getPool(baseIdx);
1138 word1;
1139 word0 = word1, word1 = word1->next) {
1140 if (word1->base >= minBase &&
1141 word1->base <= maxBase &&
1142 (delta = lastWord->primaryDelta(word1)) >=
1143 minCharSpacing * fontSize) {
1144 if (delta < maxWordSpacing * fontSize &&
1145 (!bestWord1 || word1->primaryCmp(bestWord1) < 0)) {
1146 bestWordBaseIdx = baseIdx;
1147 bestWord0 = word0;
1148 bestWord1 = word1;
1149 }
1150 break;
1151 }
1152 }
1153 }
1154 if (!bestWord1) {
1155 break;
1156 }
1157
1158 // remove it from the pool, and add it to the line
1159 if (bestWord0) {
1160 bestWord0->next = bestWord1->next;
1161 } else {
1162 pool->setPool(bestWordBaseIdx, bestWord1->next);
1163 }
1164 bestWord1->next = NULL;
1165 line->addWord(bestWord1);
1166 lastWord = bestWord1;
1167 }
1168
1169 // add the line
1170 if (curLine && line->cmpYX(curLine) > 0) {
1171 line0 = curLine;
1172 line1 = curLine->next;
1173 } else {
1174 line0 = NULL;
1175 line1 = lines;
1176 }
1177 for (;
1178 line1 && line->cmpYX(line1) > 0;
1179 line0 = line1, line1 = line1->next) ;
1180 if (line0) {
1181 line0->next = line;
1182 } else {
1183 lines = line;
1184 }
1185 line->next = line1;
1186 curLine = line;
1187 line->coalesce(uMap);
1188 charCount += line->len;
1189 ++nLines;
1190 }
1191
1192 // sort lines into xy order for column assignment
1193 lineArray = (TextLine **)gmallocn(nLines, sizeof(TextLine *));
1194 for (line = lines, i = 0; line; line = line->next, ++i) {
1195 lineArray[i] = line;
1196 }
1197 qsort(lineArray, nLines, sizeof(TextLine *), &TextLine::cmpXY);
1198
1199 // column assignment
1200 nColumns = 0;
1201 for (i = 0; i < nLines; ++i) {
1202 line0 = lineArray[i];
1203 col1 = 0;
1204 for (j = 0; j < i; ++j) {
1205 line1 = lineArray[j];
1206 if (line1->primaryDelta(line0) >= 0) {
1207 col2 = line1->col[line1->len] + 1;
1208 } else {
1209 k = 0; // make gcc happy
1210 switch (rot) {
1211 case 0:
1212 for (k = 0;
1213 k < line1->len &&
1214 line0->xMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1215 ++k) ;
1216 break;
1217 case 1:
1218 for (k = 0;
1219 k < line1->len &&
1220 line0->yMin >= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1221 ++k) ;
1222 break;
1223 case 2:
1224 for (k = 0;
1225 k < line1->len &&
1226 line0->xMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1227 ++k) ;
1228 break;
1229 case 3:
1230 for (k = 0;
1231 k < line1->len &&
1232 line0->yMax <= 0.5 * (line1->edge[k] + line1->edge[k+1]);
1233 ++k) ;
1234 break;
1235 }
1236 col2 = line1->col[k];
1237 }
1238 if (col2 > col1) {
1239 col1 = col2;
1240 }
1241 }
1242 for (k = 0; k <= line0->len; ++k) {
1243 line0->col[k] += col1;
1244 }
1245 if (line0->col[line0->len] > nColumns) {
1246 nColumns = line0->col[line0->len];
1247 }
1248 }
1249 gfree(lineArray);
1250 }
1251
updatePriMinMax(TextBlock * blk)1252 void TextBlock::updatePriMinMax(TextBlock *blk) {
1253 double newPriMin, newPriMax;
1254 GBool gotPriMin, gotPriMax;
1255
1256 gotPriMin = gotPriMax = gFalse;
1257 newPriMin = newPriMax = 0; // make gcc happy
1258 switch (page->primaryRot) {
1259 case 0:
1260 case 2:
1261 if (blk->yMin < yMax && blk->yMax > yMin) {
1262 if (blk->xMin < xMin) {
1263 newPriMin = blk->xMax;
1264 gotPriMin = gTrue;
1265 }
1266 if (blk->xMax > xMax) {
1267 newPriMax = blk->xMin;
1268 gotPriMax = gTrue;
1269 }
1270 }
1271 break;
1272 case 1:
1273 case 3:
1274 if (blk->xMin < xMax && blk->xMax > xMin) {
1275 if (blk->yMin < yMin) {
1276 newPriMin = blk->yMax;
1277 gotPriMin = gTrue;
1278 }
1279 if (blk->yMax > yMax) {
1280 newPriMax = blk->yMin;
1281 gotPriMax = gTrue;
1282 }
1283 }
1284 break;
1285 }
1286 if (gotPriMin) {
1287 if (newPriMin > xMin) {
1288 newPriMin = xMin;
1289 }
1290 if (newPriMin > priMin) {
1291 priMin = newPriMin;
1292 }
1293 }
1294 if (gotPriMax) {
1295 if (newPriMax < xMax) {
1296 newPriMax = xMax;
1297 }
1298 if (newPriMax < priMax) {
1299 priMax = newPriMax;
1300 }
1301 }
1302 }
1303
cmpXYPrimaryRot(const void * p1,const void * p2)1304 int TextBlock::cmpXYPrimaryRot(const void *p1, const void *p2) {
1305 TextBlock *blk1 = *(TextBlock **)p1;
1306 TextBlock *blk2 = *(TextBlock **)p2;
1307 double cmp;
1308
1309 cmp = 0; // make gcc happy
1310 switch (blk1->page->primaryRot) {
1311 case 0:
1312 if ((cmp = blk1->xMin - blk2->xMin) == 0) {
1313 cmp = blk1->yMin - blk2->yMin;
1314 }
1315 break;
1316 case 1:
1317 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1318 cmp = blk2->xMax - blk1->xMax;
1319 }
1320 break;
1321 case 2:
1322 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1323 cmp = blk2->yMin - blk1->yMin;
1324 }
1325 break;
1326 case 3:
1327 if ((cmp = blk2->yMax - blk1->yMax) == 0) {
1328 cmp = blk1->xMax - blk2->xMax;
1329 }
1330 break;
1331 }
1332 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1333 }
1334
cmpYXPrimaryRot(const void * p1,const void * p2)1335 int TextBlock::cmpYXPrimaryRot(const void *p1, const void *p2) {
1336 TextBlock *blk1 = *(TextBlock **)p1;
1337 TextBlock *blk2 = *(TextBlock **)p2;
1338 double cmp;
1339
1340 cmp = 0; // make gcc happy
1341 switch (blk1->page->primaryRot) {
1342 case 0:
1343 if ((cmp = blk1->yMin - blk2->yMin) == 0) {
1344 cmp = blk1->xMin - blk2->xMin;
1345 }
1346 break;
1347 case 1:
1348 if ((cmp = blk2->xMax - blk1->xMax) == 0) {
1349 cmp = blk1->yMin - blk2->yMin;
1350 }
1351 break;
1352 case 2:
1353 if ((cmp = blk2->yMin - blk1->yMin) == 0) {
1354 cmp = blk2->xMax - blk1->xMax;
1355 }
1356 break;
1357 case 3:
1358 if ((cmp = blk1->xMax - blk2->xMax) == 0) {
1359 cmp = blk2->yMax - blk1->yMax;
1360 }
1361 break;
1362 }
1363 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1364 }
1365
primaryCmp(TextBlock * blk)1366 int TextBlock::primaryCmp(TextBlock *blk) {
1367 double cmp;
1368
1369 cmp = 0; // make gcc happy
1370 switch (rot) {
1371 case 0:
1372 cmp = xMin - blk->xMin;
1373 break;
1374 case 1:
1375 cmp = yMin - blk->yMin;
1376 break;
1377 case 2:
1378 cmp = blk->xMax - xMax;
1379 break;
1380 case 3:
1381 cmp = blk->yMax - yMax;
1382 break;
1383 }
1384 return cmp < 0 ? -1 : cmp > 0 ? 1 : 0;
1385 }
1386
secondaryDelta(TextBlock * blk)1387 double TextBlock::secondaryDelta(TextBlock *blk) {
1388 double delta;
1389
1390 delta = 0; // make gcc happy
1391 switch (rot) {
1392 case 0:
1393 delta = blk->yMin - yMax;
1394 break;
1395 case 1:
1396 delta = xMin - blk->xMax;
1397 break;
1398 case 2:
1399 delta = yMin - blk->yMax;
1400 break;
1401 case 3:
1402 delta = blk->xMin - xMax;
1403 break;
1404 }
1405 return delta;
1406 }
1407
isBelow(TextBlock * blk)1408 GBool TextBlock::isBelow(TextBlock *blk) {
1409 GBool below;
1410
1411 below = gFalse; // make gcc happy
1412 switch (page->primaryRot) {
1413 case 0:
1414 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1415 yMin > blk->yMin;
1416 break;
1417 case 1:
1418 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1419 xMax < blk->xMax;
1420 break;
1421 case 2:
1422 below = xMin >= blk->priMin && xMax <= blk->priMax &&
1423 yMax < blk->yMax;
1424 break;
1425 case 3:
1426 below = yMin >= blk->priMin && yMax <= blk->priMax &&
1427 xMin > blk->xMin;
1428 break;
1429 }
1430
1431 return below;
1432 }
1433
1434 //------------------------------------------------------------------------
1435 // TextFlow
1436 //------------------------------------------------------------------------
1437
TextFlow(TextPage * pageA,TextBlock * blk)1438 TextFlow::TextFlow(TextPage *pageA, TextBlock *blk) {
1439 page = pageA;
1440 xMin = blk->xMin;
1441 xMax = blk->xMax;
1442 yMin = blk->yMin;
1443 yMax = blk->yMax;
1444 priMin = blk->priMin;
1445 priMax = blk->priMax;
1446 blocks = lastBlk = blk;
1447 next = NULL;
1448 }
1449
~TextFlow()1450 TextFlow::~TextFlow() {
1451 TextBlock *blk;
1452
1453 while (blocks) {
1454 blk = blocks;
1455 blocks = blocks->next;
1456 delete blk;
1457 }
1458 }
1459
addBlock(TextBlock * blk)1460 void TextFlow::addBlock(TextBlock *blk) {
1461 if (lastBlk) {
1462 lastBlk->next = blk;
1463 } else {
1464 blocks = blk;
1465 }
1466 lastBlk = blk;
1467 if (blk->xMin < xMin) {
1468 xMin = blk->xMin;
1469 }
1470 if (blk->xMax > xMax) {
1471 xMax = blk->xMax;
1472 }
1473 if (blk->yMin < yMin) {
1474 yMin = blk->yMin;
1475 }
1476 if (blk->yMax > yMax) {
1477 yMax = blk->yMax;
1478 }
1479 }
1480
blockFits(TextBlock * blk,TextBlock * prevBlk)1481 GBool TextFlow::blockFits(TextBlock *blk, TextBlock *prevBlk) {
1482 GBool fits;
1483
1484 // lower blocks must use smaller fonts
1485 if (blk->lines->words->fontSize > lastBlk->lines->words->fontSize) {
1486 return gFalse;
1487 }
1488
1489 fits = gFalse; // make gcc happy
1490 switch (page->primaryRot) {
1491 case 0:
1492 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1493 break;
1494 case 1:
1495 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1496 break;
1497 case 2:
1498 fits = blk->xMin >= priMin && blk->xMax <= priMax;
1499 break;
1500 case 3:
1501 fits = blk->yMin >= priMin && blk->yMax <= priMax;
1502 break;
1503 }
1504 return fits;
1505 }
1506
1507 #if TEXTOUT_WORD_LIST
1508
1509 //------------------------------------------------------------------------
1510 // TextWordList
1511 //------------------------------------------------------------------------
1512
TextWordList(TextPage * text,GBool physLayout)1513 TextWordList::TextWordList(TextPage *text, GBool physLayout) {
1514 TextFlow *flow;
1515 TextBlock *blk;
1516 TextLine *line;
1517 TextWord *word;
1518 TextWord **wordArray;
1519 int nWords, i;
1520
1521 words = new GList();
1522
1523 if (text->rawOrder) {
1524 for (word = text->rawWords; word; word = word->next) {
1525 words->append(word);
1526 }
1527
1528 } else if (physLayout) {
1529 // this is inefficient, but it's also the least useful of these
1530 // three cases
1531 nWords = 0;
1532 for (flow = text->flows; flow; flow = flow->next) {
1533 for (blk = flow->blocks; blk; blk = blk->next) {
1534 for (line = blk->lines; line; line = line->next) {
1535 for (word = line->words; word; word = word->next) {
1536 ++nWords;
1537 }
1538 }
1539 }
1540 }
1541 wordArray = (TextWord **)gmallocn(nWords, sizeof(TextWord *));
1542 i = 0;
1543 for (flow = text->flows; flow; flow = flow->next) {
1544 for (blk = flow->blocks; blk; blk = blk->next) {
1545 for (line = blk->lines; line; line = line->next) {
1546 for (word = line->words; word; word = word->next) {
1547 wordArray[i++] = word;
1548 }
1549 }
1550 }
1551 }
1552 qsort(wordArray, nWords, sizeof(TextWord *), &TextWord::cmpYX);
1553 for (i = 0; i < nWords; ++i) {
1554 words->append(wordArray[i]);
1555 }
1556 gfree(wordArray);
1557
1558 } else {
1559 for (flow = text->flows; flow; flow = flow->next) {
1560 for (blk = flow->blocks; blk; blk = blk->next) {
1561 for (line = blk->lines; line; line = line->next) {
1562 for (word = line->words; word; word = word->next) {
1563 words->append(word);
1564 }
1565 }
1566 }
1567 }
1568 }
1569 }
1570
~TextWordList()1571 TextWordList::~TextWordList() {
1572 delete words;
1573 }
1574
getLength()1575 int TextWordList::getLength() {
1576 return words->getLength();
1577 }
1578
get(int idx)1579 TextWord *TextWordList::get(int idx) {
1580 if (idx < 0 || idx >= words->getLength()) {
1581 return NULL;
1582 }
1583 return (TextWord *)words->get(idx);
1584 }
1585
1586 #endif // TEXTOUT_WORD_LIST
1587
1588 //------------------------------------------------------------------------
1589 // TextPage
1590 //------------------------------------------------------------------------
1591
TextPage(GBool rawOrderA)1592 TextPage::TextPage(GBool rawOrderA) {
1593 int rot;
1594
1595 rawOrder = rawOrderA;
1596 curWord = NULL;
1597 charPos = 0;
1598 curFont = NULL;
1599 curFontSize = 0;
1600 nest = 0;
1601 nTinyChars = 0;
1602 lastCharOverlap = gFalse;
1603 if (!rawOrder) {
1604 for (rot = 0; rot < 4; ++rot) {
1605 pools[rot] = new TextPool();
1606 }
1607 }
1608 flows = NULL;
1609 blocks = NULL;
1610 rawWords = NULL;
1611 rawLastWord = NULL;
1612 fonts = new GList();
1613 lastFindXMin = lastFindYMin = 0;
1614 haveLastFind = gFalse;
1615 }
1616
~TextPage()1617 TextPage::~TextPage() {
1618 int rot;
1619
1620 clear();
1621 if (!rawOrder) {
1622 for (rot = 0; rot < 4; ++rot) {
1623 delete pools[rot];
1624 }
1625 }
1626 delete fonts;
1627 }
1628
startPage(GfxState * state)1629 void TextPage::startPage(GfxState *state) {
1630 clear();
1631 if (state) {
1632 pageWidth = state->getPageWidth();
1633 pageHeight = state->getPageHeight();
1634 } else {
1635 pageWidth = pageHeight = 0;
1636 }
1637 }
1638
endPage()1639 void TextPage::endPage() {
1640 if (curWord) {
1641 endWord();
1642 }
1643 }
1644
clear()1645 void TextPage::clear() {
1646 int rot;
1647 TextFlow *flow;
1648 TextWord *word;
1649
1650 if (curWord) {
1651 delete curWord;
1652 curWord = NULL;
1653 }
1654 if (rawOrder) {
1655 while (rawWords) {
1656 word = rawWords;
1657 rawWords = rawWords->next;
1658 delete word;
1659 }
1660 } else {
1661 for (rot = 0; rot < 4; ++rot) {
1662 delete pools[rot];
1663 }
1664 while (flows) {
1665 flow = flows;
1666 flows = flows->next;
1667 delete flow;
1668 }
1669 gfree(blocks);
1670 }
1671 deleteGList(fonts, TextFontInfo);
1672
1673 curWord = NULL;
1674 charPos = 0;
1675 curFont = NULL;
1676 curFontSize = 0;
1677 nest = 0;
1678 nTinyChars = 0;
1679 if (!rawOrder) {
1680 for (rot = 0; rot < 4; ++rot) {
1681 pools[rot] = new TextPool();
1682 }
1683 }
1684 flows = NULL;
1685 blocks = NULL;
1686 rawWords = NULL;
1687 rawLastWord = NULL;
1688 fonts = new GList();
1689 }
1690
updateFont(GfxState * state)1691 void TextPage::updateFont(GfxState *state) {
1692 GfxFont *gfxFont;
1693 double *fm;
1694 char *name;
1695 int code, mCode, letterCode, anyCode;
1696 double w;
1697 int i;
1698
1699 // get the font info object
1700 curFont = NULL;
1701 for (i = 0; i < fonts->getLength(); ++i) {
1702 curFont = (TextFontInfo *)fonts->get(i);
1703 if (curFont->matches(state)) {
1704 break;
1705 }
1706 curFont = NULL;
1707 }
1708 if (!curFont) {
1709 curFont = new TextFontInfo(state);
1710 fonts->append(curFont);
1711 }
1712
1713 // adjust the font size
1714 gfxFont = state->getFont();
1715 curFontSize = state->getTransformedFontSize();
1716 if (gfxFont && gfxFont->getType() == fontType3) {
1717 // This is a hack which makes it possible to deal with some Type 3
1718 // fonts. The problem is that it's impossible to know what the
1719 // base coordinate system used in the font is without actually
1720 // rendering the font. This code tries to guess by looking at the
1721 // width of the character 'm' (which breaks if the font is a
1722 // subset that doesn't contain 'm').
1723 mCode = letterCode = anyCode = -1;
1724 for (code = 0; code < 256; ++code) {
1725 name = ((Gfx8BitFont *)gfxFont)->getCharName(code);
1726 if (name && name[0] == 'm' && name[1] == '\0') {
1727 mCode = code;
1728 }
1729 if (letterCode < 0 && name && name[1] == '\0' &&
1730 ((name[0] >= 'A' && name[0] <= 'Z') ||
1731 (name[0] >= 'a' && name[0] <= 'z'))) {
1732 letterCode = code;
1733 }
1734 if (anyCode < 0 && name &&
1735 ((Gfx8BitFont *)gfxFont)->getWidth(code) > 0) {
1736 anyCode = code;
1737 }
1738 }
1739 if (mCode >= 0 &&
1740 (w = ((Gfx8BitFont *)gfxFont)->getWidth(mCode)) > 0) {
1741 // 0.6 is a generic average 'm' width -- yes, this is a hack
1742 curFontSize *= w / 0.6;
1743 } else if (letterCode >= 0 &&
1744 (w = ((Gfx8BitFont *)gfxFont)->getWidth(letterCode)) > 0) {
1745 // even more of a hack: 0.5 is a generic letter width
1746 curFontSize *= w / 0.5;
1747 } else if (anyCode >= 0 &&
1748 (w = ((Gfx8BitFont *)gfxFont)->getWidth(anyCode)) > 0) {
1749 // better than nothing: 0.5 is a generic character width
1750 curFontSize *= w / 0.5;
1751 }
1752 fm = gfxFont->getFontMatrix();
1753 if (fm[0] != 0) {
1754 curFontSize *= fabs(fm[3] / fm[0]);
1755 }
1756 }
1757 }
1758
beginWord(GfxState * state,double x0,double y0)1759 void TextPage::beginWord(GfxState *state, double x0, double y0) {
1760 double *fontm;
1761 double m[4], m2[4];
1762 int rot;
1763
1764 // This check is needed because Type 3 characters can contain
1765 // text-drawing operations (when TextPage is being used via
1766 // {X,Win}SplashOutputDev rather than TextOutputDev).
1767 if (curWord) {
1768 ++nest;
1769 return;
1770 }
1771
1772 // compute the rotation
1773 state->getFontTransMat(&m[0], &m[1], &m[2], &m[3]);
1774 if (state->getFont()->getType() == fontType3) {
1775 fontm = state->getFont()->getFontMatrix();
1776 m2[0] = fontm[0] * m[0] + fontm[1] * m[2];
1777 m2[1] = fontm[0] * m[1] + fontm[1] * m[3];
1778 m2[2] = fontm[2] * m[0] + fontm[3] * m[2];
1779 m2[3] = fontm[2] * m[1] + fontm[3] * m[3];
1780 m[0] = m2[0];
1781 m[1] = m2[1];
1782 m[2] = m2[2];
1783 m[3] = m2[3];
1784 }
1785 if (fabs(m[0] * m[3]) > fabs(m[1] * m[2])) {
1786 rot = (m[3] < 0) ? 0 : 2;
1787 } else {
1788 rot = (m[2] > 0) ? 1 : 3;
1789 }
1790
1791 curWord = new TextWord(state, rot, x0, y0, charPos, curFont, curFontSize);
1792 }
1793
addChar(GfxState * state,double x,double y,double dx,double dy,CharCode c,int nBytes,Unicode * u,int uLen)1794 void TextPage::addChar(GfxState *state, double x, double y,
1795 double dx, double dy,
1796 CharCode c, int nBytes, Unicode *u, int uLen) {
1797 double x1, y1, w1, h1, dx2, dy2, base, sp, delta;
1798 GBool overlap;
1799 int i;
1800
1801 // throw away chars that aren't inside the page bounds
1802 state->transform(x, y, &x1, &y1);
1803 if (x1 < 0 || x1 > pageWidth ||
1804 y1 < 0 || y1 > pageHeight) {
1805 charPos += nBytes;
1806 return;
1807 }
1808
1809 // subtract char and word spacing from the dx,dy values
1810 sp = state->getCharSpace();
1811 if (c == (CharCode)0x20) {
1812 sp += state->getWordSpace();
1813 }
1814 state->textTransformDelta(sp * state->getHorizScaling(), 0, &dx2, &dy2);
1815 dx -= dx2;
1816 dy -= dy2;
1817 state->transformDelta(dx, dy, &w1, &h1);
1818
1819 // check the tiny chars limit
1820 if (!globalParams->getTextKeepTinyChars() &&
1821 fabs(w1) < 3 && fabs(h1) < 3) {
1822 if (++nTinyChars > 50000) {
1823 charPos += nBytes;
1824 return;
1825 }
1826 }
1827
1828 // break words at space character
1829 if (uLen == 1 && u[0] == (Unicode)0x20) {
1830 if (curWord) {
1831 ++curWord->charLen;
1832 }
1833 charPos += nBytes;
1834 endWord();
1835 return;
1836 }
1837
1838 // start a new word if:
1839 // (1) this character doesn't fall in the right place relative to
1840 // the end of the previous word (this places upper and lower
1841 // constraints on the position deltas along both the primary
1842 // and secondary axes), or
1843 // (2) this character overlaps the previous one (duplicated text), or
1844 // (3) the previous character was an overlap (we want each duplicated
1845 // character to be in a word by itself at this stage)
1846 if (curWord && curWord->len > 0) {
1847 base = sp = delta = 0; // make gcc happy
1848 switch (curWord->rot) {
1849 case 0:
1850 base = y1;
1851 sp = x1 - curWord->xMax;
1852 delta = x1 - curWord->edge[curWord->len - 1];
1853 break;
1854 case 1:
1855 base = x1;
1856 sp = y1 - curWord->yMax;
1857 delta = y1 - curWord->edge[curWord->len - 1];
1858 break;
1859 case 2:
1860 base = y1;
1861 sp = curWord->xMin - x1;
1862 delta = curWord->edge[curWord->len - 1] - x1;
1863 break;
1864 case 3:
1865 base = x1;
1866 sp = curWord->yMin - y1;
1867 delta = curWord->edge[curWord->len - 1] - y1;
1868 break;
1869 }
1870 overlap = fabs(delta) < dupMaxPriDelta * curWord->fontSize &&
1871 fabs(base - curWord->base) < dupMaxSecDelta * curWord->fontSize;
1872 if (overlap || lastCharOverlap ||
1873 sp < -minDupBreakOverlap * curWord->fontSize ||
1874 sp > minWordBreakSpace * curWord->fontSize ||
1875 fabs(base - curWord->base) > 0.5) {
1876 endWord();
1877 }
1878 lastCharOverlap = overlap;
1879 } else {
1880 lastCharOverlap = gFalse;
1881 }
1882
1883 if (uLen != 0) {
1884 // start a new word if needed
1885 if (!curWord) {
1886 beginWord(state, x, y);
1887 }
1888
1889 // page rotation and/or transform matrices can cause text to be
1890 // drawn in reverse order -- in this case, swap the begin/end
1891 // coordinates and break text into individual chars
1892 if ((curWord->rot == 0 && w1 < 0) ||
1893 (curWord->rot == 1 && h1 < 0) ||
1894 (curWord->rot == 2 && w1 > 0) ||
1895 (curWord->rot == 3 && h1 > 0)) {
1896 endWord();
1897 beginWord(state, x + dx, y + dy);
1898 x1 += w1;
1899 y1 += h1;
1900 w1 = -w1;
1901 h1 = -h1;
1902 }
1903
1904 // add the characters to the current word
1905 w1 /= uLen;
1906 h1 /= uLen;
1907 for (i = 0; i < uLen; ++i) {
1908 curWord->addChar(state, x1 + i*w1, y1 + i*h1, w1, h1, u[i]);
1909 }
1910 }
1911 if (curWord) {
1912 curWord->charLen += nBytes;
1913 }
1914 charPos += nBytes;
1915 }
1916
endWord()1917 void TextPage::endWord() {
1918 // This check is needed because Type 3 characters can contain
1919 // text-drawing operations (when TextPage is being used via
1920 // {X,Win}SplashOutputDev rather than TextOutputDev).
1921 if (nest > 0) {
1922 --nest;
1923 return;
1924 }
1925
1926 if (curWord) {
1927 addWord(curWord);
1928 curWord = NULL;
1929 }
1930 }
1931
addWord(TextWord * word)1932 void TextPage::addWord(TextWord *word) {
1933 // throw away zero-length words -- they don't have valid xMin/xMax
1934 // values, and they're useless anyway
1935 if (word->len == 0) {
1936 delete word;
1937 return;
1938 }
1939
1940 if (rawOrder) {
1941 if (rawLastWord) {
1942 rawLastWord->next = word;
1943 } else {
1944 rawWords = word;
1945 }
1946 rawLastWord = word;
1947 } else {
1948 pools[word->rot]->addWord(word);
1949 }
1950 }
1951
coalesce(GBool physLayout)1952 void TextPage::coalesce(GBool physLayout) {
1953 UnicodeMap *uMap;
1954 TextPool *pool;
1955 TextWord *word0, *word1, *word2;
1956 TextLine *line;
1957 TextBlock *blkList, *blkStack, *blk, *lastBlk, *blk0, *blk1;
1958 TextBlock **blkArray;
1959 TextFlow *flow, *lastFlow;
1960 int rot, poolMinBaseIdx, baseIdx, startBaseIdx;
1961 double minBase, maxBase, newMinBase, newMaxBase;
1962 double fontSize, colSpace1, colSpace2, lineSpace, intraLineSpace, blkSpace;
1963 GBool found;
1964 int count[4];
1965 int lrCount;
1966 int firstBlkIdx, nBlocksLeft;
1967 int col1, col2;
1968 int i, j, n;
1969
1970 if (rawOrder) {
1971 primaryRot = 0;
1972 primaryLR = gTrue;
1973 return;
1974 }
1975
1976 uMap = globalParams->getTextEncoding();
1977 blkList = NULL;
1978 lastBlk = NULL;
1979 nBlocks = 0;
1980 primaryRot = -1;
1981
1982 #if 0 // for debugging
1983 printf("*** initial words ***\n");
1984 for (rot = 0; rot < 4; ++rot) {
1985 pool = pools[rot];
1986 for (baseIdx = pool->minBaseIdx; baseIdx <= pool->maxBaseIdx; ++baseIdx) {
1987 for (word0 = pool->getPool(baseIdx); word0; word0 = word0->next) {
1988 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f rot=%d '",
1989 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
1990 word0->base, word0->fontSize, rot*90);
1991 for (i = 0; i < word0->len; ++i) {
1992 fputc(word0->text[i] & 0xff, stdout);
1993 }
1994 printf("'\n");
1995 }
1996 }
1997 }
1998 printf("\n");
1999 #endif
2000
2001 //----- assemble the blocks
2002
2003 //~ add an outer loop for writing mode (vertical text)
2004
2005 // build blocks for each rotation value
2006 for (rot = 0; rot < 4; ++rot) {
2007 pool = pools[rot];
2008 poolMinBaseIdx = pool->minBaseIdx;
2009 count[rot] = 0;
2010
2011 // add blocks until no more words are left
2012 while (1) {
2013
2014 // find the first non-empty line in the pool
2015 for (;
2016 poolMinBaseIdx <= pool->maxBaseIdx &&
2017 !pool->getPool(poolMinBaseIdx);
2018 ++poolMinBaseIdx) ;
2019 if (poolMinBaseIdx > pool->maxBaseIdx) {
2020 break;
2021 }
2022
2023 // look for the left-most word in the first four lines of the
2024 // pool -- this avoids starting with a superscript word
2025 startBaseIdx = poolMinBaseIdx;
2026 for (baseIdx = poolMinBaseIdx + 1;
2027 baseIdx < poolMinBaseIdx + 4 && baseIdx <= pool->maxBaseIdx;
2028 ++baseIdx) {
2029 if (!pool->getPool(baseIdx)) {
2030 continue;
2031 }
2032 if (pool->getPool(baseIdx)->primaryCmp(pool->getPool(startBaseIdx))
2033 < 0) {
2034 startBaseIdx = baseIdx;
2035 }
2036 }
2037
2038 // create a new block
2039 word0 = pool->getPool(startBaseIdx);
2040 pool->setPool(startBaseIdx, word0->next);
2041 word0->next = NULL;
2042 blk = new TextBlock(this, rot);
2043 blk->addWord(word0);
2044
2045 fontSize = word0->fontSize;
2046 minBase = maxBase = word0->base;
2047 colSpace1 = minColSpacing1 * fontSize;
2048 colSpace2 = minColSpacing2 * fontSize;
2049 lineSpace = maxLineSpacingDelta * fontSize;
2050 intraLineSpace = maxIntraLineDelta * fontSize;
2051
2052 // add words to the block
2053 do {
2054 found = gFalse;
2055
2056 // look for words on the line above the current top edge of
2057 // the block
2058 newMinBase = minBase;
2059 for (baseIdx = pool->getBaseIdx(minBase);
2060 baseIdx >= pool->getBaseIdx(minBase - lineSpace);
2061 --baseIdx) {
2062 word0 = NULL;
2063 word1 = pool->getPool(baseIdx);
2064 while (word1) {
2065 if (word1->base < minBase &&
2066 word1->base >= minBase - lineSpace &&
2067 ((rot == 0 || rot == 2)
2068 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2069 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2070 fabs(word1->fontSize - fontSize) <
2071 maxBlockFontSizeDelta1 * fontSize) {
2072 word2 = word1;
2073 if (word0) {
2074 word0->next = word1->next;
2075 } else {
2076 pool->setPool(baseIdx, word1->next);
2077 }
2078 word1 = word1->next;
2079 word2->next = NULL;
2080 blk->addWord(word2);
2081 found = gTrue;
2082 newMinBase = word2->base;
2083 } else {
2084 word0 = word1;
2085 word1 = word1->next;
2086 }
2087 }
2088 }
2089 minBase = newMinBase;
2090
2091 // look for words on the line below the current bottom edge of
2092 // the block
2093 newMaxBase = maxBase;
2094 for (baseIdx = pool->getBaseIdx(maxBase);
2095 baseIdx <= pool->getBaseIdx(maxBase + lineSpace);
2096 ++baseIdx) {
2097 word0 = NULL;
2098 word1 = pool->getPool(baseIdx);
2099 while (word1) {
2100 if (word1->base > maxBase &&
2101 word1->base <= maxBase + lineSpace &&
2102 ((rot == 0 || rot == 2)
2103 ? (word1->xMin < blk->xMax && word1->xMax > blk->xMin)
2104 : (word1->yMin < blk->yMax && word1->yMax > blk->yMin)) &&
2105 fabs(word1->fontSize - fontSize) <
2106 maxBlockFontSizeDelta1 * fontSize) {
2107 word2 = word1;
2108 if (word0) {
2109 word0->next = word1->next;
2110 } else {
2111 pool->setPool(baseIdx, word1->next);
2112 }
2113 word1 = word1->next;
2114 word2->next = NULL;
2115 blk->addWord(word2);
2116 found = gTrue;
2117 newMaxBase = word2->base;
2118 } else {
2119 word0 = word1;
2120 word1 = word1->next;
2121 }
2122 }
2123 }
2124 maxBase = newMaxBase;
2125
2126 // look for words that are on lines already in the block, and
2127 // that overlap the block horizontally
2128 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2129 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2130 ++baseIdx) {
2131 word0 = NULL;
2132 word1 = pool->getPool(baseIdx);
2133 while (word1) {
2134 if (word1->base >= minBase - intraLineSpace &&
2135 word1->base <= maxBase + intraLineSpace &&
2136 ((rot == 0 || rot == 2)
2137 ? (word1->xMin < blk->xMax + colSpace1 &&
2138 word1->xMax > blk->xMin - colSpace1)
2139 : (word1->yMin < blk->yMax + colSpace1 &&
2140 word1->yMax > blk->yMin - colSpace1)) &&
2141 fabs(word1->fontSize - fontSize) <
2142 maxBlockFontSizeDelta2 * fontSize) {
2143 word2 = word1;
2144 if (word0) {
2145 word0->next = word1->next;
2146 } else {
2147 pool->setPool(baseIdx, word1->next);
2148 }
2149 word1 = word1->next;
2150 word2->next = NULL;
2151 blk->addWord(word2);
2152 found = gTrue;
2153 } else {
2154 word0 = word1;
2155 word1 = word1->next;
2156 }
2157 }
2158 }
2159
2160 // only check for outlying words (the next two chunks of code)
2161 // if we didn't find anything else
2162 if (found) {
2163 continue;
2164 }
2165
2166 // scan down the left side of the block, looking for words
2167 // that are near (but not overlapping) the block; if there are
2168 // three or fewer, add them to the block
2169 n = 0;
2170 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2171 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2172 ++baseIdx) {
2173 word1 = pool->getPool(baseIdx);
2174 while (word1) {
2175 if (word1->base >= minBase - intraLineSpace &&
2176 word1->base <= maxBase + intraLineSpace &&
2177 ((rot == 0 || rot == 2)
2178 ? (word1->xMax <= blk->xMin &&
2179 word1->xMax > blk->xMin - colSpace2)
2180 : (word1->yMax <= blk->yMin &&
2181 word1->yMax > blk->yMin - colSpace2)) &&
2182 fabs(word1->fontSize - fontSize) <
2183 maxBlockFontSizeDelta3 * fontSize) {
2184 ++n;
2185 break;
2186 }
2187 word1 = word1->next;
2188 }
2189 }
2190 if (n > 0 && n <= 3) {
2191 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2192 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2193 ++baseIdx) {
2194 word0 = NULL;
2195 word1 = pool->getPool(baseIdx);
2196 while (word1) {
2197 if (word1->base >= minBase - intraLineSpace &&
2198 word1->base <= maxBase + intraLineSpace &&
2199 ((rot == 0 || rot == 2)
2200 ? (word1->xMax <= blk->xMin &&
2201 word1->xMax > blk->xMin - colSpace2)
2202 : (word1->yMax <= blk->yMin &&
2203 word1->yMax > blk->yMin - colSpace2)) &&
2204 fabs(word1->fontSize - fontSize) <
2205 maxBlockFontSizeDelta3 * fontSize) {
2206 word2 = word1;
2207 if (word0) {
2208 word0->next = word1->next;
2209 } else {
2210 pool->setPool(baseIdx, word1->next);
2211 }
2212 word1 = word1->next;
2213 word2->next = NULL;
2214 blk->addWord(word2);
2215 if (word2->base < minBase) {
2216 minBase = word2->base;
2217 } else if (word2->base > maxBase) {
2218 maxBase = word2->base;
2219 }
2220 found = gTrue;
2221 break;
2222 } else {
2223 word0 = word1;
2224 word1 = word1->next;
2225 }
2226 }
2227 }
2228 }
2229
2230 // scan down the right side of the block, looking for words
2231 // that are near (but not overlapping) the block; if there are
2232 // three or fewer, add them to the block
2233 n = 0;
2234 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2235 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2236 ++baseIdx) {
2237 word1 = pool->getPool(baseIdx);
2238 while (word1) {
2239 if (word1->base >= minBase - intraLineSpace &&
2240 word1->base <= maxBase + intraLineSpace &&
2241 ((rot == 0 || rot == 2)
2242 ? (word1->xMin >= blk->xMax &&
2243 word1->xMin < blk->xMax + colSpace2)
2244 : (word1->yMin >= blk->yMax &&
2245 word1->yMin < blk->yMax + colSpace2)) &&
2246 fabs(word1->fontSize - fontSize) <
2247 maxBlockFontSizeDelta3 * fontSize) {
2248 ++n;
2249 break;
2250 }
2251 word1 = word1->next;
2252 }
2253 }
2254 if (n > 0 && n <= 3) {
2255 for (baseIdx = pool->getBaseIdx(minBase - intraLineSpace);
2256 baseIdx <= pool->getBaseIdx(maxBase + intraLineSpace);
2257 ++baseIdx) {
2258 word0 = NULL;
2259 word1 = pool->getPool(baseIdx);
2260 while (word1) {
2261 if (word1->base >= minBase - intraLineSpace &&
2262 word1->base <= maxBase + intraLineSpace &&
2263 ((rot == 0 || rot == 2)
2264 ? (word1->xMin >= blk->xMax &&
2265 word1->xMin < blk->xMax + colSpace2)
2266 : (word1->yMin >= blk->yMax &&
2267 word1->yMin < blk->yMax + colSpace2)) &&
2268 fabs(word1->fontSize - fontSize) <
2269 maxBlockFontSizeDelta3 * fontSize) {
2270 word2 = word1;
2271 if (word0) {
2272 word0->next = word1->next;
2273 } else {
2274 pool->setPool(baseIdx, word1->next);
2275 }
2276 word1 = word1->next;
2277 word2->next = NULL;
2278 blk->addWord(word2);
2279 if (word2->base < minBase) {
2280 minBase = word2->base;
2281 } else if (word2->base > maxBase) {
2282 maxBase = word2->base;
2283 }
2284 found = gTrue;
2285 break;
2286 } else {
2287 word0 = word1;
2288 word1 = word1->next;
2289 }
2290 }
2291 }
2292 }
2293
2294 } while (found);
2295
2296 //~ need to compute the primary writing mode (horiz/vert) in
2297 //~ addition to primary rotation
2298
2299 // coalesce the block, and add it to the list
2300 blk->coalesce(uMap);
2301 if (lastBlk) {
2302 lastBlk->next = blk;
2303 } else {
2304 blkList = blk;
2305 }
2306 lastBlk = blk;
2307 count[rot] += blk->charCount;
2308 if (primaryRot < 0 || count[rot] > count[primaryRot]) {
2309 primaryRot = rot;
2310 }
2311 ++nBlocks;
2312 }
2313 }
2314
2315 #if 0 // for debugging
2316 printf("*** rotation ***\n");
2317 for (rot = 0; rot < 4; ++rot) {
2318 printf(" %d: %6d\n", rot, count[rot]);
2319 }
2320 printf(" primary rot = %d\n", primaryRot);
2321 printf("\n");
2322 #endif
2323
2324 #if 0 // for debugging
2325 printf("*** blocks ***\n");
2326 for (blk = blkList; blk; blk = blk->next) {
2327 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f\n",
2328 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax);
2329 for (line = blk->lines; line; line = line->next) {
2330 printf(" line: x=%.2f..%.2f y=%.2f..%.2f base=%.2f\n",
2331 line->xMin, line->xMax, line->yMin, line->yMax, line->base);
2332 for (word0 = line->words; word0; word0 = word0->next) {
2333 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2334 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2335 word0->base, word0->fontSize, word0->spaceAfter);
2336 for (i = 0; i < word0->len; ++i) {
2337 fputc(word0->text[i] & 0xff, stdout);
2338 }
2339 printf("'\n");
2340 }
2341 }
2342 }
2343 printf("\n");
2344 #endif
2345
2346 // determine the primary direction
2347 lrCount = 0;
2348 for (blk = blkList; blk; blk = blk->next) {
2349 for (line = blk->lines; line; line = line->next) {
2350 for (word0 = line->words; word0; word0 = word0->next) {
2351 for (i = 0; i < word0->len; ++i) {
2352 if (unicodeTypeL(word0->text[i])) {
2353 ++lrCount;
2354 } else if (unicodeTypeR(word0->text[i])) {
2355 --lrCount;
2356 }
2357 }
2358 }
2359 }
2360 }
2361 primaryLR = lrCount >= 0;
2362
2363 #if 0 // for debugging
2364 printf("*** direction ***\n");
2365 printf("lrCount = %d\n", lrCount);
2366 printf("primaryLR = %d\n", primaryLR);
2367 #endif
2368
2369 //----- column assignment
2370
2371 // sort blocks into xy order for column assignment
2372 blocks = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2373 for (blk = blkList, i = 0; blk; blk = blk->next, ++i) {
2374 blocks[i] = blk;
2375 }
2376 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpXYPrimaryRot);
2377
2378 // column assignment
2379 for (i = 0; i < nBlocks; ++i) {
2380 blk0 = blocks[i];
2381 col1 = 0;
2382 for (j = 0; j < i; ++j) {
2383 blk1 = blocks[j];
2384 col2 = 0; // make gcc happy
2385 switch (primaryRot) {
2386 case 0:
2387 if (blk0->xMin > blk1->xMax) {
2388 col2 = blk1->col + blk1->nColumns + 3;
2389 } else if (blk1->xMax == blk1->xMin) {
2390 col2 = blk1->col;
2391 } else {
2392 col2 = blk1->col + (int)(((blk0->xMin - blk1->xMin) /
2393 (blk1->xMax - blk1->xMin)) *
2394 blk1->nColumns);
2395 }
2396 break;
2397 case 1:
2398 if (blk0->yMin > blk1->yMax) {
2399 col2 = blk1->col + blk1->nColumns + 3;
2400 } else if (blk1->yMax == blk1->yMin) {
2401 col2 = blk1->col;
2402 } else {
2403 col2 = blk1->col + (int)(((blk0->yMin - blk1->yMin) /
2404 (blk1->yMax - blk1->yMin)) *
2405 blk1->nColumns);
2406 }
2407 break;
2408 case 2:
2409 if (blk0->xMax < blk1->xMin) {
2410 col2 = blk1->col + blk1->nColumns + 3;
2411 } else if (blk1->xMin == blk1->xMax) {
2412 col2 = blk1->col;
2413 } else {
2414 col2 = blk1->col + (int)(((blk0->xMax - blk1->xMax) /
2415 (blk1->xMin - blk1->xMax)) *
2416 blk1->nColumns);
2417 }
2418 break;
2419 case 3:
2420 if (blk0->yMax < blk1->yMin) {
2421 col2 = blk1->col + blk1->nColumns + 3;
2422 } else if (blk1->yMin == blk1->yMax) {
2423 col2 = blk1->col;
2424 } else {
2425 col2 = blk1->col + (int)(((blk0->yMax - blk1->yMax) /
2426 (blk1->yMin - blk1->yMax)) *
2427 blk1->nColumns);
2428 }
2429 break;
2430 }
2431 if (col2 > col1) {
2432 col1 = col2;
2433 }
2434 }
2435 blk0->col = col1;
2436 for (line = blk0->lines; line; line = line->next) {
2437 for (j = 0; j <= line->len; ++j) {
2438 line->col[j] += col1;
2439 }
2440 }
2441 }
2442
2443 #if 0 // for debugging
2444 printf("*** blocks, after column assignment ***\n");
2445 for (blk = blkList; blk; blk = blk->next) {
2446 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f col=%d nCols=%d\n",
2447 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax, blk->col,
2448 blk->nColumns);
2449 for (line = blk->lines; line; line = line->next) {
2450 printf(" line:\n");
2451 for (word0 = line->words; word0; word0 = word0->next) {
2452 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2453 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2454 word0->base, word0->fontSize, word0->spaceAfter);
2455 for (i = 0; i < word0->len; ++i) {
2456 fputc(word0->text[i] & 0xff, stdout);
2457 }
2458 printf("'\n");
2459 }
2460 }
2461 }
2462 printf("\n");
2463 #endif
2464
2465 //----- reading order sort
2466
2467 // sort blocks into yx order (in preparation for reading order sort)
2468 qsort(blocks, nBlocks, sizeof(TextBlock *), &TextBlock::cmpYXPrimaryRot);
2469
2470 // compute space on left and right sides of each block
2471 for (i = 0; i < nBlocks; ++i) {
2472 blk0 = blocks[i];
2473 for (j = 0; j < nBlocks; ++j) {
2474 blk1 = blocks[j];
2475 if (blk1 != blk0) {
2476 blk0->updatePriMinMax(blk1);
2477 }
2478 }
2479 }
2480
2481 #if 0 // for debugging
2482 printf("*** blocks, after yx sort ***\n");
2483 for (i = 0; i < nBlocks; ++i) {
2484 blk = blocks[i];
2485 printf("block: rot=%d x=%.2f..%.2f y=%.2f..%.2f space=%.2f..%.2f\n",
2486 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2487 blk->priMin, blk->priMax);
2488 for (line = blk->lines; line; line = line->next) {
2489 printf(" line:\n");
2490 for (word0 = line->words; word0; word0 = word0->next) {
2491 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2492 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2493 word0->base, word0->fontSize, word0->spaceAfter);
2494 for (j = 0; j < word0->len; ++j) {
2495 fputc(word0->text[j] & 0xff, stdout);
2496 }
2497 printf("'\n");
2498 }
2499 }
2500 }
2501 printf("\n");
2502 #endif
2503
2504 // build the flows
2505 //~ this needs to be adjusted for writing mode (vertical text)
2506 //~ this also needs to account for right-to-left column ordering
2507 blkArray = (TextBlock **)gmallocn(nBlocks, sizeof(TextBlock *));
2508 memcpy(blkArray, blocks, nBlocks * sizeof(TextBlock *));
2509 flows = lastFlow = NULL;
2510 firstBlkIdx = 0;
2511 nBlocksLeft = nBlocks;
2512 while (nBlocksLeft > 0) {
2513
2514 // find the upper-left-most block
2515 for (; !blkArray[firstBlkIdx]; ++firstBlkIdx) ;
2516 i = firstBlkIdx;
2517 blk = blkArray[i];
2518 for (j = firstBlkIdx + 1; j < nBlocks; ++j) {
2519 blk1 = blkArray[j];
2520 if (blk1) {
2521 if (blk && blk->secondaryDelta(blk1) > 0) {
2522 break;
2523 }
2524 if (blk1->primaryCmp(blk) < 0) {
2525 i = j;
2526 blk = blk1;
2527 }
2528 }
2529 }
2530 blkArray[i] = NULL;
2531 --nBlocksLeft;
2532 blk->next = NULL;
2533
2534 // create a new flow, starting with the upper-left-most block
2535 flow = new TextFlow(this, blk);
2536 if (lastFlow) {
2537 lastFlow->next = flow;
2538 } else {
2539 flows = flow;
2540 }
2541 lastFlow = flow;
2542 fontSize = blk->lines->words->fontSize;
2543
2544 // push the upper-left-most block on the stack
2545 blk->stackNext = NULL;
2546 blkStack = blk;
2547
2548 // find the other blocks in this flow
2549 while (blkStack) {
2550
2551 // find the upper-left-most block under (but within
2552 // maxBlockSpacing of) the top block on the stack
2553 blkSpace = maxBlockSpacing * blkStack->lines->words->fontSize;
2554 blk = NULL;
2555 i = -1;
2556 for (j = firstBlkIdx; j < nBlocks; ++j) {
2557 blk1 = blkArray[j];
2558 if (blk1) {
2559 if (blkStack->secondaryDelta(blk1) > blkSpace) {
2560 break;
2561 }
2562 if (blk && blk->secondaryDelta(blk1) > 0) {
2563 break;
2564 }
2565 if (blk1->isBelow(blkStack) &&
2566 (!blk || blk1->primaryCmp(blk) < 0)) {
2567 i = j;
2568 blk = blk1;
2569 }
2570 }
2571 }
2572
2573 // if a suitable block was found, add it to the flow and push it
2574 // onto the stack
2575 if (blk && flow->blockFits(blk, blkStack)) {
2576 blkArray[i] = NULL;
2577 --nBlocksLeft;
2578 blk->next = NULL;
2579 flow->addBlock(blk);
2580 fontSize = blk->lines->words->fontSize;
2581 blk->stackNext = blkStack;
2582 blkStack = blk;
2583
2584 // otherwise (if there is no block under the top block or the
2585 // block is not suitable), pop the stack
2586 } else {
2587 blkStack = blkStack->stackNext;
2588 }
2589 }
2590 }
2591 gfree(blkArray);
2592
2593 #if 0 // for debugging
2594 printf("*** flows ***\n");
2595 for (flow = flows; flow; flow = flow->next) {
2596 printf("flow: x=%.2f..%.2f y=%.2f..%.2f pri:%.2f..%.2f\n",
2597 flow->xMin, flow->xMax, flow->yMin, flow->yMax,
2598 flow->priMin, flow->priMax);
2599 for (blk = flow->blocks; blk; blk = blk->next) {
2600 printf(" block: rot=%d x=%.2f..%.2f y=%.2f..%.2f pri=%.2f..%.2f\n",
2601 blk->rot, blk->xMin, blk->xMax, blk->yMin, blk->yMax,
2602 blk->priMin, blk->priMax);
2603 for (line = blk->lines; line; line = line->next) {
2604 printf(" line:\n");
2605 for (word0 = line->words; word0; word0 = word0->next) {
2606 printf(" word: x=%.2f..%.2f y=%.2f..%.2f base=%.2f fontSize=%.2f space=%d: '",
2607 word0->xMin, word0->xMax, word0->yMin, word0->yMax,
2608 word0->base, word0->fontSize, word0->spaceAfter);
2609 for (i = 0; i < word0->len; ++i) {
2610 fputc(word0->text[i] & 0xff, stdout);
2611 }
2612 printf("'\n");
2613 }
2614 }
2615 }
2616 }
2617 printf("\n");
2618 #endif
2619
2620 if (uMap) {
2621 uMap->decRefCnt();
2622 }
2623 }
2624
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)2625 GBool TextPage::findText(Unicode *s, int len,
2626 GBool startAtTop, GBool stopAtBottom,
2627 GBool startAtLast, GBool stopAtLast,
2628 GBool caseSensitive, GBool backward,
2629 double *xMin, double *yMin,
2630 double *xMax, double *yMax) {
2631 TextBlock *blk;
2632 TextLine *line;
2633 Unicode *s2, *txt;
2634 Unicode *p;
2635 int txtSize, m, i, j, k;
2636 double xStart, yStart, xStop, yStop;
2637 double xMin0, yMin0, xMax0, yMax0;
2638 double xMin1, yMin1, xMax1, yMax1;
2639 GBool found;
2640
2641 //~ needs to handle right-to-left text
2642
2643 if (rawOrder) {
2644 return gFalse;
2645 }
2646
2647 // convert the search string to uppercase
2648 if (!caseSensitive) {
2649 s2 = (Unicode *)gmallocn(len, sizeof(Unicode));
2650 for (i = 0; i < len; ++i) {
2651 s2[i] = unicodeToUpper(s[i]);
2652 }
2653 } else {
2654 s2 = s;
2655 }
2656
2657 txt = NULL;
2658 txtSize = 0;
2659
2660 xStart = yStart = xStop = yStop = 0;
2661 if (startAtLast && haveLastFind) {
2662 xStart = lastFindXMin;
2663 yStart = lastFindYMin;
2664 } else if (!startAtTop) {
2665 xStart = *xMin;
2666 yStart = *yMin;
2667 }
2668 if (stopAtLast && haveLastFind) {
2669 xStop = lastFindXMin;
2670 yStop = lastFindYMin;
2671 } else if (!stopAtBottom) {
2672 xStop = *xMax;
2673 yStop = *yMax;
2674 }
2675
2676 found = gFalse;
2677 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
2678 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
2679
2680 for (i = backward ? nBlocks - 1 : 0;
2681 backward ? i >= 0 : i < nBlocks;
2682 i += backward ? -1 : 1) {
2683 blk = blocks[i];
2684
2685 // check: is the block above the top limit?
2686 if (!startAtTop && (backward ? blk->yMin > yStart : blk->yMax < yStart)) {
2687 continue;
2688 }
2689
2690 // check: is the block below the bottom limit?
2691 if (!stopAtBottom && (backward ? blk->yMax < yStop : blk->yMin > yStop)) {
2692 break;
2693 }
2694
2695 for (line = blk->lines; line; line = line->next) {
2696
2697 // check: is the line above the top limit?
2698 if (!startAtTop &&
2699 (backward ? line->yMin > yStart : line->yMin < yStart)) {
2700 continue;
2701 }
2702
2703 // check: is the line below the bottom limit?
2704 if (!stopAtBottom &&
2705 (backward ? line->yMin < yStop : line->yMin > yStop)) {
2706 continue;
2707 }
2708
2709 // convert the line to uppercase
2710 m = line->len;
2711 if (!caseSensitive) {
2712 if (m > txtSize) {
2713 txt = (Unicode *)greallocn(txt, m, sizeof(Unicode));
2714 txtSize = m;
2715 }
2716 for (k = 0; k < m; ++k) {
2717 txt[k] = unicodeToUpper(line->text[k]);
2718 }
2719 } else {
2720 txt = line->text;
2721 }
2722
2723 // search each position in this line
2724 j = backward ? m - len : 0;
2725 p = txt + j;
2726 while (backward ? j >= 0 : j <= m - len) {
2727
2728 // compare the strings
2729 for (k = 0; k < len; ++k) {
2730 if (p[k] != s2[k]) {
2731 break;
2732 }
2733 }
2734
2735 // found it
2736 if (k == len) {
2737 switch (line->rot) {
2738 case 0:
2739 xMin1 = line->edge[j];
2740 xMax1 = line->edge[j + len];
2741 yMin1 = line->yMin;
2742 yMax1 = line->yMax;
2743 break;
2744 case 1:
2745 xMin1 = line->xMin;
2746 xMax1 = line->xMax;
2747 yMin1 = line->edge[j];
2748 yMax1 = line->edge[j + len];
2749 break;
2750 case 2:
2751 xMin1 = line->edge[j + len];
2752 xMax1 = line->edge[j];
2753 yMin1 = line->yMin;
2754 yMax1 = line->yMax;
2755 break;
2756 case 3:
2757 xMin1 = line->xMin;
2758 xMax1 = line->xMax;
2759 yMin1 = line->edge[j + len];
2760 yMax1 = line->edge[j];
2761 break;
2762 }
2763 if (backward) {
2764 if ((startAtTop ||
2765 yMin1 < yStart || (yMin1 == yStart && xMin1 < xStart)) &&
2766 (stopAtBottom ||
2767 yMin1 > yStop || (yMin1 == yStop && xMin1 > xStop))) {
2768 if (!found ||
2769 yMin1 > yMin0 || (yMin1 == yMin0 && xMin1 > xMin0)) {
2770 xMin0 = xMin1;
2771 xMax0 = xMax1;
2772 yMin0 = yMin1;
2773 yMax0 = yMax1;
2774 found = gTrue;
2775 }
2776 }
2777 } else {
2778 if ((startAtTop ||
2779 yMin1 > yStart || (yMin1 == yStart && xMin1 > xStart)) &&
2780 (stopAtBottom ||
2781 yMin1 < yStop || (yMin1 == yStop && xMin1 < xStop))) {
2782 if (!found ||
2783 yMin1 < yMin0 || (yMin1 == yMin0 && xMin1 < xMin0)) {
2784 xMin0 = xMin1;
2785 xMax0 = xMax1;
2786 yMin0 = yMin1;
2787 yMax0 = yMax1;
2788 found = gTrue;
2789 }
2790 }
2791 }
2792 }
2793 if (backward) {
2794 --j;
2795 --p;
2796 } else {
2797 ++j;
2798 ++p;
2799 }
2800 }
2801 }
2802 }
2803
2804 if (!caseSensitive) {
2805 gfree(s2);
2806 gfree(txt);
2807 }
2808
2809 if (found) {
2810 *xMin = xMin0;
2811 *xMax = xMax0;
2812 *yMin = yMin0;
2813 *yMax = yMax0;
2814 lastFindXMin = xMin0;
2815 lastFindYMin = yMin0;
2816 haveLastFind = gTrue;
2817 return gTrue;
2818 }
2819
2820 return gFalse;
2821 }
2822
getText(double xMin,double yMin,double xMax,double yMax)2823 GString *TextPage::getText(double xMin, double yMin,
2824 double xMax, double yMax) {
2825 GString *s;
2826 UnicodeMap *uMap;
2827 GBool isUnicode;
2828 TextBlock *blk;
2829 TextLine *line;
2830 TextLineFrag *frags;
2831 int nFrags, fragsSize;
2832 TextLineFrag *frag;
2833 char space[8], eol[16];
2834 int spaceLen, eolLen;
2835 int lastRot;
2836 double x, y;
2837 int col, idx0, idx1, i, j;
2838 GBool multiLine, oneRot;
2839
2840 s = new GString();
2841
2842 if (rawOrder) {
2843 return s;
2844 }
2845
2846 // get the output encoding
2847 if (!(uMap = globalParams->getTextEncoding())) {
2848 return s;
2849 }
2850 isUnicode = uMap->isUnicode();
2851 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
2852 eolLen = 0; // make gcc happy
2853 switch (globalParams->getTextEOL()) {
2854 case eolUnix:
2855 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
2856 break;
2857 case eolDOS:
2858 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2859 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
2860 break;
2861 case eolMac:
2862 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
2863 break;
2864 }
2865
2866 //~ writing mode (horiz/vert)
2867
2868 // collect the line fragments that are in the rectangle
2869 fragsSize = 256;
2870 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
2871 nFrags = 0;
2872 lastRot = -1;
2873 oneRot = gTrue;
2874 for (i = 0; i < nBlocks; ++i) {
2875 blk = blocks[i];
2876 if (xMin < blk->xMax && blk->xMin < xMax &&
2877 yMin < blk->yMax && blk->yMin < yMax) {
2878 for (line = blk->lines; line; line = line->next) {
2879 if (xMin < line->xMax && line->xMin < xMax &&
2880 yMin < line->yMax && line->yMin < yMax) {
2881 idx0 = idx1 = -1;
2882 switch (line->rot) {
2883 case 0:
2884 y = 0.5 * (line->yMin + line->yMax);
2885 if (yMin < y && y < yMax) {
2886 j = 0;
2887 while (j < line->len) {
2888 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2889 idx0 = j;
2890 break;
2891 }
2892 ++j;
2893 }
2894 j = line->len - 1;
2895 while (j >= 0) {
2896 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2897 idx1 = j;
2898 break;
2899 }
2900 --j;
2901 }
2902 }
2903 break;
2904 case 1:
2905 x = 0.5 * (line->xMin + line->xMax);
2906 if (xMin < x && x < xMax) {
2907 j = 0;
2908 while (j < line->len) {
2909 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2910 idx0 = j;
2911 break;
2912 }
2913 ++j;
2914 }
2915 j = line->len - 1;
2916 while (j >= 0) {
2917 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2918 idx1 = j;
2919 break;
2920 }
2921 --j;
2922 }
2923 }
2924 break;
2925 case 2:
2926 y = 0.5 * (line->yMin + line->yMax);
2927 if (yMin < y && y < yMax) {
2928 j = 0;
2929 while (j < line->len) {
2930 if (0.5 * (line->edge[j] + line->edge[j+1]) < xMax) {
2931 idx0 = j;
2932 break;
2933 }
2934 ++j;
2935 }
2936 j = line->len - 1;
2937 while (j >= 0) {
2938 if (0.5 * (line->edge[j] + line->edge[j+1]) > xMin) {
2939 idx1 = j;
2940 break;
2941 }
2942 --j;
2943 }
2944 }
2945 break;
2946 case 3:
2947 x = 0.5 * (line->xMin + line->xMax);
2948 if (xMin < x && x < xMax) {
2949 j = 0;
2950 while (j < line->len) {
2951 if (0.5 * (line->edge[j] + line->edge[j+1]) < yMax) {
2952 idx0 = j;
2953 break;
2954 }
2955 ++j;
2956 }
2957 j = line->len - 1;
2958 while (j >= 0) {
2959 if (0.5 * (line->edge[j] + line->edge[j+1]) > yMin) {
2960 idx1 = j;
2961 break;
2962 }
2963 --j;
2964 }
2965 }
2966 break;
2967 }
2968 if (idx0 >= 0 && idx1 >= 0) {
2969 if (nFrags == fragsSize) {
2970 fragsSize *= 2;
2971 frags = (TextLineFrag *)
2972 greallocn(frags, fragsSize, sizeof(TextLineFrag));
2973 }
2974 frags[nFrags].init(line, idx0, idx1 - idx0 + 1);
2975 ++nFrags;
2976 if (lastRot >= 0 && line->rot != lastRot) {
2977 oneRot = gFalse;
2978 }
2979 lastRot = line->rot;
2980 }
2981 }
2982 }
2983 }
2984 }
2985
2986 // sort the fragments and generate the string
2987 if (nFrags > 0) {
2988
2989 for (i = 0; i < nFrags; ++i) {
2990 frags[i].computeCoords(oneRot);
2991 }
2992 assignColumns(frags, nFrags, oneRot);
2993
2994 // if all lines in the region have the same rotation, use it;
2995 // otherwise, use the page's primary rotation
2996 if (oneRot) {
2997 qsort(frags, nFrags, sizeof(TextLineFrag),
2998 &TextLineFrag::cmpYXLineRot);
2999 } else {
3000 qsort(frags, nFrags, sizeof(TextLineFrag),
3001 &TextLineFrag::cmpYXPrimaryRot);
3002 }
3003
3004 col = 0;
3005 multiLine = gFalse;
3006 for (i = 0; i < nFrags; ++i) {
3007 frag = &frags[i];
3008
3009 // insert a return
3010 if (frag->col < col ||
3011 (i > 0 && fabs(frag->base - frags[i-1].base) >
3012 maxIntraLineDelta * frags[i-1].line->words->fontSize)) {
3013 s->append(eol, eolLen);
3014 col = 0;
3015 multiLine = gTrue;
3016 }
3017
3018 // column alignment
3019 for (; col < frag->col; ++col) {
3020 s->append(space, spaceLen);
3021 }
3022
3023 // get the fragment text
3024 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3025 }
3026
3027 if (multiLine) {
3028 s->append(eol, eolLen);
3029 }
3030 }
3031
3032 gfree(frags);
3033 uMap->decRefCnt();
3034
3035 return s;
3036 }
3037
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)3038 GBool TextPage::findCharRange(int pos, int length,
3039 double *xMin, double *yMin,
3040 double *xMax, double *yMax) {
3041 TextBlock *blk;
3042 TextLine *line;
3043 TextWord *word;
3044 double xMin0, xMax0, yMin0, yMax0;
3045 double xMin1, xMax1, yMin1, yMax1;
3046 GBool first;
3047 int i, j0, j1;
3048
3049 if (rawOrder) {
3050 return gFalse;
3051 }
3052
3053 //~ this doesn't correctly handle:
3054 //~ - ranges split across multiple lines (the highlighted region
3055 //~ is the bounding box of all the parts of the range)
3056 //~ - cases where characters don't convert one-to-one into Unicode
3057 first = gTrue;
3058 xMin0 = xMax0 = yMin0 = yMax0 = 0; // make gcc happy
3059 xMin1 = xMax1 = yMin1 = yMax1 = 0; // make gcc happy
3060 for (i = 0; i < nBlocks; ++i) {
3061 blk = blocks[i];
3062 for (line = blk->lines; line; line = line->next) {
3063 for (word = line->words; word; word = word->next) {
3064 if (pos < word->charPos + word->charLen &&
3065 word->charPos < pos + length) {
3066 j0 = pos - word->charPos;
3067 if (j0 < 0) {
3068 j0 = 0;
3069 }
3070 j1 = pos + length - 1 - word->charPos;
3071 if (j1 >= word->len) {
3072 j1 = word->len - 1;
3073 }
3074 switch (line->rot) {
3075 case 0:
3076 xMin1 = word->edge[j0];
3077 xMax1 = word->edge[j1 + 1];
3078 yMin1 = word->yMin;
3079 yMax1 = word->yMax;
3080 break;
3081 case 1:
3082 xMin1 = word->xMin;
3083 xMax1 = word->xMax;
3084 yMin1 = word->edge[j0];
3085 yMax1 = word->edge[j1 + 1];
3086 break;
3087 case 2:
3088 xMin1 = word->edge[j1 + 1];
3089 xMax1 = word->edge[j0];
3090 yMin1 = word->yMin;
3091 yMax1 = word->yMax;
3092 break;
3093 case 3:
3094 xMin1 = word->xMin;
3095 xMax1 = word->xMax;
3096 yMin1 = word->edge[j1 + 1];
3097 yMax1 = word->edge[j0];
3098 break;
3099 }
3100 if (first || xMin1 < xMin0) {
3101 xMin0 = xMin1;
3102 }
3103 if (first || xMax1 > xMax0) {
3104 xMax0 = xMax1;
3105 }
3106 if (first || yMin1 < yMin0) {
3107 yMin0 = yMin1;
3108 }
3109 if (first || yMax1 > yMax0) {
3110 yMax0 = yMax1;
3111 }
3112 first = gFalse;
3113 }
3114 }
3115 }
3116 }
3117 if (!first) {
3118 *xMin = xMin0;
3119 *xMax = xMax0;
3120 *yMin = yMin0;
3121 *yMax = yMax0;
3122 return gTrue;
3123 }
3124 return gFalse;
3125 }
3126
dump(void * outputStream,TextOutputFunc outputFunc,GBool physLayout)3127 void TextPage::dump(void *outputStream, TextOutputFunc outputFunc,
3128 GBool physLayout) {
3129 UnicodeMap *uMap;
3130 TextFlow *flow;
3131 TextBlock *blk;
3132 TextLine *line;
3133 TextLineFrag *frags;
3134 TextWord *word;
3135 int nFrags, fragsSize;
3136 TextLineFrag *frag;
3137 char space[8], eol[16], eop[8];
3138 int spaceLen, eolLen, eopLen;
3139 GBool pageBreaks;
3140 GString *s;
3141 int col, i, d, n;
3142
3143 // get the output encoding
3144 if (!(uMap = globalParams->getTextEncoding())) {
3145 return;
3146 }
3147 spaceLen = uMap->mapUnicode(0x20, space, sizeof(space));
3148 eolLen = 0; // make gcc happy
3149 switch (globalParams->getTextEOL()) {
3150 case eolUnix:
3151 eolLen = uMap->mapUnicode(0x0a, eol, sizeof(eol));
3152 break;
3153 case eolDOS:
3154 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3155 eolLen += uMap->mapUnicode(0x0a, eol + eolLen, sizeof(eol) - eolLen);
3156 break;
3157 case eolMac:
3158 eolLen = uMap->mapUnicode(0x0d, eol, sizeof(eol));
3159 break;
3160 }
3161 eopLen = uMap->mapUnicode(0x0c, eop, sizeof(eop));
3162 pageBreaks = globalParams->getTextPageBreaks();
3163
3164 //~ writing mode (horiz/vert)
3165
3166 // output the page in raw (content stream) order
3167 if (rawOrder) {
3168
3169 for (word = rawWords; word; word = word->next) {
3170 s = new GString();
3171 dumpFragment(word->text, word->len, uMap, s);
3172 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3173 delete s;
3174 if (word->next &&
3175 fabs(word->next->base - word->base) <
3176 maxIntraLineDelta * word->fontSize) {
3177 if (word->next->xMin > word->xMax + minWordSpacing * word->fontSize) {
3178 (*outputFunc)(outputStream, space, spaceLen);
3179 }
3180 } else {
3181 (*outputFunc)(outputStream, eol, eolLen);
3182 }
3183 }
3184
3185 // output the page, maintaining the original physical layout
3186 } else if (physLayout) {
3187
3188 // collect the line fragments for the page and sort them
3189 fragsSize = 256;
3190 frags = (TextLineFrag *)gmallocn(fragsSize, sizeof(TextLineFrag));
3191 nFrags = 0;
3192 for (i = 0; i < nBlocks; ++i) {
3193 blk = blocks[i];
3194 for (line = blk->lines; line; line = line->next) {
3195 if (nFrags == fragsSize) {
3196 fragsSize *= 2;
3197 frags = (TextLineFrag *)greallocn(frags,
3198 fragsSize, sizeof(TextLineFrag));
3199 }
3200 frags[nFrags].init(line, 0, line->len);
3201 frags[nFrags].computeCoords(gTrue);
3202 ++nFrags;
3203 }
3204 }
3205 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpYXPrimaryRot);
3206
3207 #if 0 // for debugging
3208 printf("*** line fragments ***\n");
3209 for (i = 0; i < nFrags; ++i) {
3210 frag = &frags[i];
3211 printf("frag: x=%.2f..%.2f y=%.2f..%.2f base=%.2f '",
3212 frag->xMin, frag->xMax, frag->yMin, frag->yMax, frag->base);
3213 for (n = 0; n < frag->len; ++n) {
3214 fputc(frag->line->text[frag->start + n] & 0xff, stdout);
3215 }
3216 printf("'\n");
3217 }
3218 printf("\n");
3219 #endif
3220
3221 // generate output
3222 col = 0;
3223 for (i = 0; i < nFrags; ++i) {
3224 frag = &frags[i];
3225
3226 // column alignment
3227 for (; col < frag->col; ++col) {
3228 (*outputFunc)(outputStream, space, spaceLen);
3229 }
3230
3231 // print the line
3232 s = new GString();
3233 col += dumpFragment(frag->line->text + frag->start, frag->len, uMap, s);
3234 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3235 delete s;
3236
3237 // print one or more returns if necessary
3238 if (i == nFrags - 1 ||
3239 frags[i+1].col < col ||
3240 fabs(frags[i+1].base - frag->base) >
3241 maxIntraLineDelta * frag->line->words->fontSize) {
3242 if (i < nFrags - 1) {
3243 d = (int)((frags[i+1].base - frag->base) /
3244 frag->line->words->fontSize);
3245 if (d < 1) {
3246 d = 1;
3247 } else if (d > 5) {
3248 d = 5;
3249 }
3250 } else {
3251 d = 1;
3252 }
3253 for (; d > 0; --d) {
3254 (*outputFunc)(outputStream, eol, eolLen);
3255 }
3256 col = 0;
3257 }
3258 }
3259
3260 gfree(frags);
3261
3262 // output the page, "undoing" the layout
3263 } else {
3264 for (flow = flows; flow; flow = flow->next) {
3265 for (blk = flow->blocks; blk; blk = blk->next) {
3266 for (line = blk->lines; line; line = line->next) {
3267 n = line->len;
3268 if (line->hyphenated && (line->next || blk->next)) {
3269 --n;
3270 }
3271 s = new GString();
3272 dumpFragment(line->text, n, uMap, s);
3273 (*outputFunc)(outputStream, s->getCString(), s->getLength());
3274 delete s;
3275 if (!line->hyphenated) {
3276 if (line->next) {
3277 (*outputFunc)(outputStream, space, spaceLen);
3278 } else if (blk->next) {
3279 //~ this is a bit of a kludge - we should really do a more
3280 //~ intelligent determination of paragraphs
3281 if (blk->next->lines->words->fontSize ==
3282 blk->lines->words->fontSize) {
3283 (*outputFunc)(outputStream, space, spaceLen);
3284 } else {
3285 (*outputFunc)(outputStream, eol, eolLen);
3286 }
3287 }
3288 }
3289 }
3290 }
3291 (*outputFunc)(outputStream, eol, eolLen);
3292 (*outputFunc)(outputStream, eol, eolLen);
3293 }
3294 }
3295
3296 // end of page
3297 if (pageBreaks) {
3298 (*outputFunc)(outputStream, eop, eopLen);
3299 }
3300
3301 uMap->decRefCnt();
3302 }
3303
assignColumns(TextLineFrag * frags,int nFrags,GBool oneRot)3304 void TextPage::assignColumns(TextLineFrag *frags, int nFrags, GBool oneRot) {
3305 TextLineFrag *frag0, *frag1;
3306 int rot, col1, col2, i, j, k;
3307
3308 // all text in the region has the same rotation -- recompute the
3309 // column numbers based only on the text in the region
3310 if (oneRot) {
3311 qsort(frags, nFrags, sizeof(TextLineFrag), &TextLineFrag::cmpXYLineRot);
3312 rot = frags[0].line->rot;
3313 for (i = 0; i < nFrags; ++i) {
3314 frag0 = &frags[i];
3315 col1 = 0;
3316 for (j = 0; j < i; ++j) {
3317 frag1 = &frags[j];
3318 col2 = 0; // make gcc happy
3319 switch (rot) {
3320 case 0:
3321 if (frag0->xMin >= frag1->xMax) {
3322 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3323 frag1->line->col[frag1->start]) + 1;
3324 } else {
3325 for (k = frag1->start;
3326 k < frag1->start + frag1->len &&
3327 frag0->xMin >= 0.5 * (frag1->line->edge[k] +
3328 frag1->line->edge[k+1]);
3329 ++k) ;
3330 col2 = frag1->col +
3331 frag1->line->col[k] - frag1->line->col[frag1->start];
3332 }
3333 break;
3334 case 1:
3335 if (frag0->yMin >= frag1->yMax) {
3336 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3337 frag1->line->col[frag1->start]) + 1;
3338 } else {
3339 for (k = frag1->start;
3340 k < frag1->start + frag1->len &&
3341 frag0->yMin >= 0.5 * (frag1->line->edge[k] +
3342 frag1->line->edge[k+1]);
3343 ++k) ;
3344 col2 = frag1->col +
3345 frag1->line->col[k] - frag1->line->col[frag1->start];
3346 }
3347 break;
3348 case 2:
3349 if (frag0->xMax <= frag1->xMin) {
3350 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3351 frag1->line->col[frag1->start]) + 1;
3352 } else {
3353 for (k = frag1->start;
3354 k < frag1->start + frag1->len &&
3355 frag0->xMax <= 0.5 * (frag1->line->edge[k] +
3356 frag1->line->edge[k+1]);
3357 ++k) ;
3358 col2 = frag1->col +
3359 frag1->line->col[k] - frag1->line->col[frag1->start];
3360 }
3361 break;
3362 case 3:
3363 if (frag0->yMax <= frag1->yMin) {
3364 col2 = frag1->col + (frag1->line->col[frag1->start + frag1->len] -
3365 frag1->line->col[frag1->start]) + 1;
3366 } else {
3367 for (k = frag1->start;
3368 k < frag1->start + frag1->len &&
3369 frag0->yMax <= 0.5 * (frag1->line->edge[k] +
3370 frag1->line->edge[k+1]);
3371 ++k) ;
3372 col2 = frag1->col +
3373 frag1->line->col[k] - frag1->line->col[frag1->start];
3374 }
3375 break;
3376 }
3377 if (col2 > col1) {
3378 col1 = col2;
3379 }
3380 }
3381 frag0->col = col1;
3382 }
3383
3384 // the region includes text at different rotations -- use the
3385 // globally assigned column numbers, offset by the minimum column
3386 // number (i.e., shift everything over to column 0)
3387 } else {
3388 col1 = frags[0].col;
3389 for (i = 1; i < nFrags; ++i) {
3390 if (frags[i].col < col1) {
3391 col1 = frags[i].col;
3392 }
3393 }
3394 for (i = 0; i < nFrags; ++i) {
3395 frags[i].col -= col1;
3396 }
3397 }
3398 }
3399
dumpFragment(Unicode * text,int len,UnicodeMap * uMap,GString * s)3400 int TextPage::dumpFragment(Unicode *text, int len, UnicodeMap *uMap,
3401 GString *s) {
3402 char lre[8], rle[8], popdf[8], buf[8];
3403 int lreLen, rleLen, popdfLen, n;
3404 int nCols, i, j, k;
3405
3406 nCols = 0;
3407
3408 if (uMap->isUnicode()) {
3409
3410 lreLen = uMap->mapUnicode(0x202a, lre, sizeof(lre));
3411 rleLen = uMap->mapUnicode(0x202b, rle, sizeof(rle));
3412 popdfLen = uMap->mapUnicode(0x202c, popdf, sizeof(popdf));
3413
3414 if (primaryLR) {
3415
3416 i = 0;
3417 while (i < len) {
3418 // output a left-to-right section
3419 for (j = i; j < len && !unicodeTypeR(text[j]); ++j) ;
3420 for (k = i; k < j; ++k) {
3421 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3422 s->append(buf, n);
3423 ++nCols;
3424 }
3425 i = j;
3426 // output a right-to-left section
3427 for (j = i; j < len && !unicodeTypeL(text[j]); ++j) ;
3428 if (j > i) {
3429 s->append(rle, rleLen);
3430 for (k = j - 1; k >= i; --k) {
3431 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3432 s->append(buf, n);
3433 ++nCols;
3434 }
3435 s->append(popdf, popdfLen);
3436 i = j;
3437 }
3438 }
3439
3440 } else {
3441
3442 s->append(rle, rleLen);
3443 i = len - 1;
3444 while (i >= 0) {
3445 // output a right-to-left section
3446 for (j = i; j >= 0 && !unicodeTypeL(text[j]); --j) ;
3447 for (k = i; k > j; --k) {
3448 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3449 s->append(buf, n);
3450 ++nCols;
3451 }
3452 i = j;
3453 // output a left-to-right section
3454 for (j = i; j >= 0 && !unicodeTypeR(text[j]); --j) ;
3455 if (j < i) {
3456 s->append(lre, lreLen);
3457 for (k = j + 1; k <= i; ++k) {
3458 n = uMap->mapUnicode(text[k], buf, sizeof(buf));
3459 s->append(buf, n);
3460 ++nCols;
3461 }
3462 s->append(popdf, popdfLen);
3463 i = j;
3464 }
3465 }
3466 s->append(popdf, popdfLen);
3467
3468 }
3469
3470 } else {
3471 for (i = 0; i < len; ++i) {
3472 n = uMap->mapUnicode(text[i], buf, sizeof(buf));
3473 s->append(buf, n);
3474 nCols += n;
3475 }
3476 }
3477
3478 return nCols;
3479 }
3480
#if TEXTOUT_WORD_LIST
// Build a new TextWordList for this page; <physLayout> is passed
// through to the TextWordList constructor.  The list is allocated
// with new and returned to the caller.
TextWordList *TextPage::makeWordList(GBool physLayout) {
  TextWordList *wordList;

  wordList = new TextWordList(this, physLayout);
  return wordList;
}
#endif
3486
3487 //------------------------------------------------------------------------
3488 // TextOutputDev
3489 //------------------------------------------------------------------------
3490
// Output callback used when writing to a file: <stream> is the FILE
// pointer, and the <len> bytes at <text> are written to it.
static void outputToFile(void *stream, char *text, int len) {
  FILE *f = (FILE *)stream;

  fwrite(text, sizeof(char), len, f);
}
3494
TextOutputDev(char * fileName,GBool physLayoutA,GBool rawOrderA,GBool append)3495 TextOutputDev::TextOutputDev(char *fileName, GBool physLayoutA,
3496 GBool rawOrderA, GBool append) {
3497 text = NULL;
3498 physLayout = physLayoutA;
3499 rawOrder = rawOrderA;
3500 ok = gTrue;
3501
3502 // open file
3503 needClose = gFalse;
3504 if (fileName) {
3505 if (!strcmp(fileName, "-")) {
3506 outputStream = stdout;
3507 #ifdef WIN32
3508 // keep DOS from munging the end-of-line characters
3509 setmode(fileno(stdout), O_BINARY);
3510 #endif
3511 } else if ((outputStream = fopen(fileName, append ? "ab" : "wb"))) {
3512 needClose = gTrue;
3513 } else {
3514 error(-1, "Couldn't open text file '%s'", fileName);
3515 ok = gFalse;
3516 return;
3517 }
3518 outputFunc = &outputToFile;
3519 } else {
3520 outputStream = NULL;
3521 }
3522
3523 // set up text object
3524 text = new TextPage(rawOrderA);
3525 }
3526
TextOutputDev(TextOutputFunc func,void * stream,GBool physLayoutA,GBool rawOrderA)3527 TextOutputDev::TextOutputDev(TextOutputFunc func, void *stream,
3528 GBool physLayoutA, GBool rawOrderA) {
3529 outputFunc = func;
3530 outputStream = stream;
3531 needClose = gFalse;
3532 physLayout = physLayoutA;
3533 rawOrder = rawOrderA;
3534 text = new TextPage(rawOrderA);
3535 ok = gTrue;
3536 }
3537
~TextOutputDev()3538 TextOutputDev::~TextOutputDev() {
3539 if (needClose) {
3540 #ifdef MACOS
3541 ICS_MapRefNumAndAssign((short)((FILE *)outputStream)->handle);
3542 #endif
3543 fclose((FILE *)outputStream);
3544 }
3545 if (text) {
3546 delete text;
3547 }
3548 }
3549
startPage(int pageNum,GfxState * state)3550 void TextOutputDev::startPage(int pageNum, GfxState *state)
3551 {
3552 text->startPage(state);
3553 }
3554
endPage()3555 void TextOutputDev::endPage() {
3556 text->endPage();
3557 text->coalesce(physLayout);
3558 if (outputStream) {
3559 text->dump(outputStream, outputFunc, physLayout);
3560 }
3561 }
3562
updateFont(GfxState * state)3563 void TextOutputDev::updateFont(GfxState *state) {
3564 text->updateFont(state);
3565 }
3566
beginString(GfxState * state,GString * s)3567 void TextOutputDev::beginString(GfxState *state, GString *s) {
3568 }
3569
endString(GfxState * state)3570 void TextOutputDev::endString(GfxState *state) {
3571 }
3572
drawChar(GfxState * state,double x,double y,double dx,double dy,double originX,double originY,CharCode c,int nBytes,Unicode * u,int uLen)3573 void TextOutputDev::drawChar(GfxState *state, double x, double y,
3574 double dx, double dy,
3575 double originX, double originY,
3576 CharCode c, int nBytes, Unicode *u, int uLen) {
3577 text->addChar(state, x, y, dx, dy, c, nBytes, u, uLen);
3578 }
3579
findText(Unicode * s,int len,GBool startAtTop,GBool stopAtBottom,GBool startAtLast,GBool stopAtLast,GBool caseSensitive,GBool backward,double * xMin,double * yMin,double * xMax,double * yMax)3580 GBool TextOutputDev::findText(Unicode *s, int len,
3581 GBool startAtTop, GBool stopAtBottom,
3582 GBool startAtLast, GBool stopAtLast,
3583 GBool caseSensitive, GBool backward,
3584 double *xMin, double *yMin,
3585 double *xMax, double *yMax) {
3586 return text->findText(s, len, startAtTop, stopAtBottom,
3587 startAtLast, stopAtLast, caseSensitive, backward,
3588 xMin, yMin, xMax, yMax);
3589 }
3590
getText(double xMin,double yMin,double xMax,double yMax)3591 GString *TextOutputDev::getText(double xMin, double yMin,
3592 double xMax, double yMax) {
3593 return text->getText(xMin, yMin, xMax, yMax);
3594 }
3595
findCharRange(int pos,int length,double * xMin,double * yMin,double * xMax,double * yMax)3596 GBool TextOutputDev::findCharRange(int pos, int length,
3597 double *xMin, double *yMin,
3598 double *xMax, double *yMax) {
3599 return text->findCharRange(pos, length, xMin, yMin, xMax, yMax);
3600 }
3601
3602 #if TEXTOUT_WORD_LIST
makeWordList()3603 TextWordList *TextOutputDev::makeWordList() {
3604 return text->makeWordList(physLayout);
3605 }
3606 #endif
3607
takeText()3608 TextPage *TextOutputDev::takeText() {
3609 TextPage *ret;
3610
3611 ret = text;
3612 text = new TextPage(rawOrder);
3613 return ret;
3614 }
3615