1 //========================================================================
2 //
3 // TextOutputDev.h
4 //
5 // Copyright 1997-2003 Glyph & Cog, LLC
6 //
7 //========================================================================
8 
9 //========================================================================
10 //
11 // Modified under the Poppler project - http://poppler.freedesktop.org
12 //
13 // All changes made under the Poppler project to this file are licensed
14 // under GPL version 2 or later
15 //
16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com>
17 // Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk>
18 // Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos <carlosgc@gnome.org>
19 // Copyright (C) 2007, 2017 Adrian Johnson <ajohnson@redneon.com>
20 // Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org>
21 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com>
22 // Copyright (C) 2012, 2013, 2015, 2016 Jason Crain <jason@aquaticape.us>
23 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de>
24 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich
25 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com>
26 // Copyright (C) 2018, 2020, 2021 Nelson Benítez León <nbenitezl@gmail.com>
27 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de>
28 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com>
29 // Copyright (C) 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp>
30 //
31 // To see a description of the changes please see the Changelog file that
32 // came with your tarball or type make ChangeLog if you are building from git
33 //
34 //========================================================================
35 
36 #ifndef TEXTOUTPUTDEV_H
37 #define TEXTOUTPUTDEV_H
38 
39 #include "poppler-config.h"
40 #include "poppler_private_export.h"
41 #include <cstdio>
42 #include "GfxFont.h"
43 #include "GfxState.h"
44 #include "OutputDev.h"
45 
46 class GooString;
47 class Gfx;
48 class GfxFont;
49 class GfxState;
50 class UnicodeMap;
51 class AnnotLink;
52 
53 class TextWord;
54 class TextPool;
55 class TextLine;
56 class TextLineFrag;
57 class TextBlock;
58 class TextFlow;
59 class TextLink;
60 class TextUnderline;
61 class TextWordList;
62 class TextPage;
63 class TextSelectionVisitor;
64 
65 //------------------------------------------------------------------------
66 
// Signature of the text output callback accepted by TextPage::dump() and
// by the stream-based TextOutputDev constructor.  The callback is handed
// an opaque <stream> pointer, a character buffer <text>, and a length
// <len>.
using TextOutputFunc = void (*)(void *stream, const char *text, int len);
68 
// Selection style (glyph, word, or line) passed to the visitSelection()
// family and to the TextPage selection queries.  The enumerator values
// are spelled out; they are the same values the compiler would assign
// implicitly.
enum SelectionStyle
{
    selectionStyleGlyph = 0,
    selectionStyleWord = 1,
    selectionStyleLine = 2
};
75 
// End-of-line convention requested by callers of TextPage::getText() and
// TextPage::dump().  The enumerator values are spelled out; they are the
// same values the compiler would assign implicitly.
enum EndOfLineKind
{
    eolUnix = 0, // LF
    eolDOS = 1, // CR+LF
    eolMac = 2 // CR
};
82 
83 //------------------------------------------------------------------------
84 // TextFontInfo
85 //------------------------------------------------------------------------
86 
87 class POPPLER_PRIVATE_EXPORT TextFontInfo
88 {
89 public:
90     explicit TextFontInfo(const GfxState *state);
91     ~TextFontInfo();
92 
93     TextFontInfo(const TextFontInfo &) = delete;
94     TextFontInfo &operator=(const TextFontInfo &) = delete;
95 
96     bool matches(const GfxState *state) const;
97     bool matches(const TextFontInfo *fontInfo) const;
98     bool matches(const Ref *ref) const;
99 
100     // Get the font ascent, or a default value if the font is not set
101     double getAscent() const;
102 
103     // Get the font descent, or a default value if the font is not set
104     double getDescent() const;
105 
106     // Get the writing mode (0 or 1), or 0 if the font is not set
107     int getWMode() const;
108 
109 #ifdef TEXTOUT_WORD_LIST
110     // Get the font name (which may be NULL).
getFontName()111     const GooString *getFontName() const { return fontName; }
112 
113     // Get font descriptor flags.
isFixedWidth()114     bool isFixedWidth() const { return flags & fontFixedWidth; }
isSerif()115     bool isSerif() const { return flags & fontSerif; }
isSymbolic()116     bool isSymbolic() const { return flags & fontSymbolic; }
isItalic()117     bool isItalic() const { return flags & fontItalic; }
isBold()118     bool isBold() const { return flags & fontBold; }
119 #endif
120 
121 private:
122     GfxFont *gfxFont;
123 #ifdef TEXTOUT_WORD_LIST
124     GooString *fontName;
125     int flags;
126 #endif
127 
128     friend class TextWord;
129     friend class TextPage;
130     friend class TextSelectionPainter;
131 };
132 
133 //------------------------------------------------------------------------
134 // TextWord
135 //------------------------------------------------------------------------
136 
137 class POPPLER_PRIVATE_EXPORT TextWord
138 {
139 public:
140     // Constructor.
141     TextWord(const GfxState *state, int rotA, double fontSize);
142 
143     // Destructor.
144     ~TextWord();
145 
146     TextWord(const TextWord &) = delete;
147     TextWord &operator=(const TextWord &) = delete;
148 
149     // Add a character to the word.
150     void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
151 
152     // Attempt to add a character to the word as a combining character.
153     // Either character u or the last character in the word must be an
154     // acute, dieresis, or other combining character.  Returns true if
155     // the character was added.
156     bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA);
157 
158     // Merge <word> onto the end of <this>.
159     void merge(TextWord *word);
160 
161     // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>),
162     // based on a primary-axis comparison, e.g., x ordering if rot=0.
163     int primaryCmp(const TextWord *word) const;
164 
165     // Return the distance along the primary axis between <this> and
166     // <word>.
167     double primaryDelta(const TextWord *word) const;
168 
169     static int cmpYX(const void *p1, const void *p2);
170 
171     void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);
172 
173     // Get the TextFontInfo object associated with a character.
getFontInfo(int idx)174     const TextFontInfo *getFontInfo(int idx) const { return font[idx]; }
175 
176     // Get the next TextWord on the linked list.
getNext()177     const TextWord *getNext() const { return next; }
178 
179 #ifdef TEXTOUT_WORD_LIST
getLength()180     int getLength() const { return len; }
getChar(int idx)181     const Unicode *getChar(int idx) const { return &text[idx]; }
182     GooString *getText() const;
getFontName(int idx)183     const GooString *getFontName(int idx) const { return font[idx]->fontName; }
getColor(double * r,double * g,double * b)184     void getColor(double *r, double *g, double *b) const
185     {
186         *r = colorR;
187         *g = colorG;
188         *b = colorB;
189     }
getBBox(double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)190     void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
191     {
192         *xMinA = xMin;
193         *yMinA = yMin;
194         *xMaxA = xMax;
195         *yMaxA = yMax;
196     }
197     void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const;
getFontSize()198     double getFontSize() const { return fontSize; }
getRotation()199     int getRotation() const { return rot; }
getCharPos()200     int getCharPos() const { return charPos[0]; }
getCharLen()201     int getCharLen() const { return charPos[len] - charPos[0]; }
getSpaceAfter()202     bool getSpaceAfter() const { return spaceAfter; }
203 #endif
isUnderlined()204     bool isUnderlined() const { return underlined; }
getLink()205     const AnnotLink *getLink() const { return link; }
getEdge(int i)206     double getEdge(int i) const { return edge[i]; }
getBaseline()207     double getBaseline() const { return base; }
hasSpaceAfter()208     bool hasSpaceAfter() const { return spaceAfter; }
nextWord()209     const TextWord *nextWord() const { return next; };
210 
211 private:
212     void ensureCapacity(int capacity);
213     void setInitialBounds(TextFontInfo *fontA, double x, double y);
214 
215     int rot; // rotation, multiple of 90 degrees
216              //   (0, 1, 2, or 3)
217     int wMode; // horizontal (0) or vertical (1) writing mode
218     double xMin, xMax; // bounding box x coordinates
219     double yMin, yMax; // bounding box y coordinates
220     double base; // baseline x or y coordinate
221     Unicode *text; // the text
222     CharCode *charcode; // glyph indices
223     double *edge; // "near" edge x or y coord of each char
224                   //   (plus one extra entry for the last char)
225     int *charPos; // character position (within content stream)
226                   //   of each char (plus one extra entry for
227                   //   the last char)
228     int len; // length of text/edge/charPos/font arrays
229     int size; // size of text/edge/charPos/font arrays
230     TextFontInfo **font; // font information for each char
231     Matrix *textMat; // transformation matrix for each char
232     double fontSize; // font size
233     bool spaceAfter; // set if there is a space between this
234                      //   word and the next word on the line
235     bool underlined;
236     bool invisible; // whether we are invisible (glyphless)
237     TextWord *next; // next word in line
238 
239 #ifdef TEXTOUT_WORD_LIST
240     double colorR, // word color
241             colorG, colorB;
242 #endif
243 
244     AnnotLink *link;
245 
246     friend class TextPool;
247     friend class TextLine;
248     friend class TextBlock;
249     friend class TextFlow;
250     friend class TextWordList;
251     friend class TextPage;
252 
253     friend class TextSelectionPainter;
254     friend class TextSelectionDumper;
255 };
256 
257 //------------------------------------------------------------------------
258 // TextPool
259 //------------------------------------------------------------------------
260 
261 class TextPool
262 {
263 public:
264     TextPool();
265     ~TextPool();
266 
267     TextPool(const TextPool &) = delete;
268     TextPool &operator=(const TextPool &) = delete;
269 
getPool(int baseIdx)270     TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; }
setPool(int baseIdx,TextWord * p)271     void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; }
272 
273     int getBaseIdx(double base) const;
274 
275     void addWord(TextWord *word);
276 
277 private:
278     int minBaseIdx; // min baseline bucket index
279     int maxBaseIdx; // max baseline bucket index
280     TextWord **pool; // array of linked lists, one for each
281                      //   baseline value (multiple of 4 pts)
282     TextWord *cursor; // pointer to last-accessed word
283     int cursorBaseIdx; // baseline bucket index of last-accessed word
284 
285     friend class TextBlock;
286     friend class TextPage;
287 };
288 
289 struct TextFlowData;
290 
291 //------------------------------------------------------------------------
292 // TextLine
293 //------------------------------------------------------------------------
294 
// A line of text: a linked list of TextWords belonging to one TextBlock,
// plus the Unicode text of the whole line and per-character edge and
// column arrays.
class TextLine
{
public:
    // Create a line for block <blkA> with rotation <rotA> and baseline
    // coordinate <baseA>.
    TextLine(TextBlock *blkA, int rotA, double baseA);
    ~TextLine();

    // Copying is disabled.
    TextLine(const TextLine &) = delete;
    TextLine &operator=(const TextLine &) = delete;

    // Add <word> to this line.
    void addWord(TextWord *word);

    // Return the distance along the primary axis between <this> and
    // <line>.
    double primaryDelta(const TextLine *line) const;

    // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
    // based on a primary-axis comparison, e.g., x ordering if rot=0.
    int primaryCmp(const TextLine *line) const;

    // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>),
    // based on a secondary-axis comparison of the baselines, e.g., y
    // ordering if rot=0.
    int secondaryCmp(const TextLine *line) const;

    // Compare <this> to <line>; the exact ordering is defined in the
    // implementation file.
    int cmpYX(const TextLine *line) const;

    // Comparison callback with the (const void *, const void *)
    // signature used by qsort(); the ordering itself is defined in the
    // implementation file.
    static int cmpXY(const void *p1, const void *p2);

    // Coalesce the words of this line, using <uMap> for character
    // conversion.
    // NOTE(review): presumably this fills text/edge/col and the
    // normalized/ascii arrays below -- confirm in the implementation.
    void coalesce(const UnicodeMap *uMap);

    // Visit the part of this line covered by <selection>, reporting to
    // <visitor> according to <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Get the head of the linked list of TextWords.
    const TextWord *getWords() const { return words; }

    // Get the next TextLine on the linked list.
    const TextLine *getNext() const { return next; }

    // Returns true if the last char of the line is a hyphen.
    bool isHyphenated() const { return hyphenated; }

private:
    TextBlock *blk; // parent block
    int rot; // text rotation
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double base; // baseline x or y coordinate
    TextWord *words; // words in this line
    TextWord *lastWord; // last word in this line
    Unicode *text; // Unicode text of the line, including
                   //   spaces between words
    double *edge; // "near" edge x or y coord of each char
                  //   (plus one extra entry for the last char)
    int *col; // starting column number of each Unicode char
    int len; // number of Unicode chars
    int convertedLen; // total number of converted characters
    bool hyphenated; // set if last char is a hyphen
    TextLine *next; // next line in block
    Unicode *normalized; // normalized form of Unicode text
    int normalized_len; // number of normalized Unicode chars
    int *normalized_idx; // indices of normalized chars into Unicode text
    Unicode *ascii_translation; // ascii translation from the normalized text
    int ascii_len; // length of ascii translation text
    int *ascii_idx; // indices of ascii chars into Unicode text of line

    friend class TextLineFrag;
    friend class TextBlock;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextPage;

    friend class TextSelectionPainter;
    friend class TextSelectionSizer;
    friend class TextSelectionDumper;
};
370 
371 //------------------------------------------------------------------------
372 // TextBlock
373 //------------------------------------------------------------------------
374 
// A block of text: a linked list of TextLines on a TextPage, with its
// bounding boxes, table bookkeeping and column information.
class TextBlock
{
public:
    // Create a block on page <pageA> with rotation <rotA>.
    TextBlock(TextPage *pageA, int rotA);
    ~TextBlock();

    // Copying is disabled.
    TextBlock(const TextBlock &) = delete;
    TextBlock &operator=(const TextBlock &) = delete;

    // Add <word> to this block.
    void addWord(TextWord *word);

    // Coalesce the words of this block, using <uMap> for character
    // conversion and <fixedPitch> as the fixed-pitch setting.
    // NOTE(review): presumably this builds the list of lines from the
    // word pool -- confirm in the implementation.
    void coalesce(const UnicodeMap *uMap, double fixedPitch);

    // Update this block's priMin and priMax values, looking at <blk>.
    void updatePriMinMax(const TextBlock *blk);

    // Comparison callbacks with the (const void *, const void *)
    // signature used by qsort(); the orderings themselves are defined
    // in the implementation file.
    static int cmpXYPrimaryRot(const void *p1, const void *p2);

    static int cmpYXPrimaryRot(const void *p1, const void *p2);

    // Compare <this> to <blk> along the primary axis.
    int primaryCmp(const TextBlock *blk) const;

    // Distance between <this> and <blk> along the secondary axis.
    double secondaryDelta(const TextBlock *blk) const;

    // Returns true if <this> is below <blk>, relative to the page's
    // primary rotation.
    bool isBelow(const TextBlock *blk) const;

    // Visit the part of this block covered by <selection>, reporting to
    // <visitor> according to <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Get the head of the linked list of TextLines.
    const TextLine *getLines() const { return lines; }

    // Get the next TextBlock on the linked list.
    const TextBlock *getNext() const { return next; }

    // Copy the block bounding box into the four output pointers; all of
    // them are written through unconditionally.
    void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const
    {
        *xMinA = xMin;
        *yMinA = yMin;
        *xMaxA = xMax;
        *yMaxA = yMax;
    }

    // Number of lines in the block.
    int getLineCount() const { return nLines; }

private:
    // Ordering predicates and depth-first helpers.
    // NOTE(review): presumably used to sort blocks into reading order --
    // confirm in the implementation file.
    bool isBeforeByRule1(const TextBlock *blk1);
    bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1);
    bool isBeforeByRule2(const TextBlock *blk1);

    int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited);
    int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize);

    TextPage *page; // the parent page
    int rot; // text rotation
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double priMin, priMax; // whitespace bounding box along primary axis
    double ExMin, ExMax; // extended bounding box x coordinates
    double EyMin, EyMax; // extended bounding box y coordinates
    int tableId; // id of table to which this block belongs
    bool tableEnd; // is this block at end of line of actual table

    TextPool *pool; // pool of words (used only until lines
                    //   are built)
    TextLine *lines; // linked list of lines
    TextLine *curLine; // most recently added line
    int nLines; // number of lines
    int charCount; // number of characters in the block
    int col; // starting column
    int nColumns; // number of columns in the block

    TextBlock *next; // next block on the linked list (see getNext())
    TextBlock *stackNext; // second link between blocks; its use is not
                          //   visible in this header
                          //   NOTE(review): confirm in the implementation

    friend class TextLine;
    friend class TextLineFrag;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextPage;
    friend class TextSelectionPainter;
    friend class TextSelectionDumper;
};
459 
460 //------------------------------------------------------------------------
461 // TextFlow
462 //------------------------------------------------------------------------
463 
// A flow: a linked list of TextBlocks on a TextPage, with a bounding box
// and a primary-axis whitespace range.
class TextFlow
{
public:
    // Create a flow on page <pageA> from the block <blk>.
    TextFlow(TextPage *pageA, TextBlock *blk);
    ~TextFlow();

    // Copying is disabled.
    TextFlow(const TextFlow &) = delete;
    TextFlow &operator=(const TextFlow &) = delete;

    // Add a block to the end of this flow.
    void addBlock(TextBlock *blk);

    // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1)
    // it uses a font no larger than the last block added to the flow,
    // and (2) it fits within the flow's [priMin, priMax] along the
    // primary axis.
    bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const;

    // Get the head of the linked list of TextBlocks.
    const TextBlock *getBlocks() const { return blocks; }

    // Get the next TextFlow on the linked list.
    const TextFlow *getNext() const { return next; }

private:
    TextPage *page; // the parent page
    double xMin, xMax; // bounding box x coordinates
    double yMin, yMax; // bounding box y coordinates
    double priMin, priMax; // whitespace bounding box along primary axis
    TextBlock *blocks; // blocks in flow
    TextBlock *lastBlk; // last block in this flow
    TextFlow *next; // next flow on the linked list (see getNext())

    friend class TextWordList;
    friend class TextPage;
};
500 
501 #ifdef TEXTOUT_WORD_LIST
502 
503 //------------------------------------------------------------------------
504 // TextWordList
505 //------------------------------------------------------------------------
506 
// A flat list of the TextWords of a TextPage.  Only available when
// TEXTOUT_WORD_LIST is defined.
class POPPLER_PRIVATE_EXPORT TextWordList
{
public:
    // Build a flat word list, in content stream order (if
    // text->rawOrder is true), physical layout order (if <physLayout>
    // is true and text->rawOrder is false), or reading order (if both
    // flags are false).
    TextWordList(const TextPage *text, bool physLayout);

    ~TextWordList();

    // Copying is disabled.
    TextWordList(const TextWordList &) = delete;
    TextWordList &operator=(const TextWordList &) = delete;

    // Return the number of words on the list.
    int getLength() const;

    // Return the <idx>th word from the list.
    // NOTE(review): the behavior for an out-of-range <idx> is not visible
    // in this header -- confirm in the implementation.
    TextWord *get(int idx);

private:
    std::vector<TextWord *> words; // the words, held as raw pointers
};
530 
531 #endif // TEXTOUT_WORD_LIST
532 
533 class TextWordSelection
534 {
535 public:
TextWordSelection(const TextWord * wordA,int beginA,int endA)536     TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { }
537 
getWord()538     const TextWord *getWord() const { return word; }
getBegin()539     int getBegin() const { return begin; }
getEnd()540     int getEnd() const { return end; }
541 
542 private:
543     const TextWord *word;
544     int begin;
545     int end;
546 
547     friend class TextSelectionPainter;
548     friend class TextSelectionDumper;
549 };
550 
551 //------------------------------------------------------------------------
552 // TextPage
553 //------------------------------------------------------------------------
554 
// The text content of one page: collects characters into words, sorts
// the words into pools, and (after coalesce()) exposes them as flows,
// blocks and lines.  Also provides text search, extraction and selection
// queries.  The destructor is private and the class carries a reference
// count (see incRefCnt()/decRefCnt()).
class POPPLER_PRIVATE_EXPORT TextPage
{
public:
    // Constructor.  <rawOrderA> keeps text in content stream order;
    // <discardDiagA> discards diagonal text.
    explicit TextPage(bool rawOrderA, bool discardDiagA = false);

    // Copying is disabled.
    TextPage(const TextPage &) = delete;
    TextPage &operator=(const TextPage &) = delete;

    // Adjust the reference count.
    // NOTE(review): since the destructor is private, decRefCnt() is
    // presumably what destroys the page when the count drops to zero --
    // confirm in the implementation.
    void incRefCnt();
    void decRefCnt();

    // Start a new page.
    void startPage(const GfxState *state);

    // End the current page.
    void endPage();

    // Update the current font.
    void updateFont(const GfxState *state);

    // Begin a new word.
    void beginWord(const GfxState *state);

    // Add a character to the current word.
    void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);

    // Add <nChars> invisible characters.
    void incCharCount(int nChars);

    // End the current word, sorting it into the list of words.
    void endWord();

    // Add a word, sorting it into the list of words.
    void addWord(TextWord *word);

    // Add a (potential) underline.
    void addUnderline(double x0, double y0, double x1, double y1);

    // Add a hyperlink.
    void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link);

    // Coalesce strings that look like parts of the same line.
    void coalesce(bool physLayout, double fixedPitch, bool doHTML);

    // Find a string.  If <startAtTop> is true, starts looking at the
    // top of the page; else if <startAtLast> is true, starts looking
    // immediately after the last find result; else starts looking at
    // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
    // bottom of the page; else if <stopAtLast> is true, stops looking
    // just before the last find result; else stops looking at
    // <xMax>,<yMax>.
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax);

    // Adds new parameter ignoreDiacritics, which will do diacritics
    // insensitive search, i.e. ignore accents, umlauts, diaeresis, etc.
    // while matching. This option will be ignored if <s> contains characters
    // which are not pure ascii.
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax,
                  double *yMax);

    // Adds new parameter <matchAcrossLines>, which allows <s> to match on text
    // spanning from end of a line to the next line. In that case, the rect for
    // the part of match that falls on the next line will be stored in
    // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line)
    // was used while matching at the end of the line prior to <continueMatch>,
    // then <ignoredHyphen> will be true, otherwise will be false.
    // Only finding across two lines is supported, i.e. it won't match where <s>
    // spans more than two lines.
    //
    // <matchAcrossLines> will be ignored if <backward> is true (as that
    // combination has not been implemented yet).
    bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin,
                  double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen);

    // Get the text which is inside the specified rectangle.
    // NOTE(review): returns a raw pointer; presumably the caller owns the
    // result -- confirm in the implementation.
    GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const;

    // Visit the text covered by <selection>, reporting to <visitor>
    // according to <style>.
    void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style);

    // Draw the selection <selection> on <out>, using <glyph_color> and
    // <box_color>, with the given <scale> and <rotation>.
    void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);

    // Return the rectangles making up the selection <selection>.
    // NOTE(review): both the vector and its elements are raw pointers;
    // presumably the caller owns them -- confirm in the implementation.
    std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);

    // Return the text covered by the selection <selection>.
    // NOTE(review): raw pointer result; ownership presumably passes to
    // the caller -- confirm in the implementation.
    GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);

    // Return the selected words as an array of per-line vectors; the
    // number of lines is written to <nLines>.
    // NOTE(review): ownership of the array, the vectors and their
    // elements is not visible here -- confirm in the implementation.
    std::vector<TextWordSelection *> **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines);

    // Find a string by character position and length.  If found, sets
    // the text bounding rectangle and returns true; otherwise returns
    // false.
    bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;

    // Dump contents of page to a file.
    void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks);

    // Get the head of the linked list of TextFlows.
    const TextFlow *getFlows() const { return flows; }

    // If true, will combine characters when a base and combining
    // character are drawn on each other.
    void setMergeCombining(bool merge);

#ifdef TEXTOUT_WORD_LIST
    // Build a flat word list, in content stream order (if
    // this->rawOrder is true), physical layout order (if <physLayout>
    // is true and this->rawOrder is false), or reading order (if both
    // flags are false).
    std::unique_ptr<TextWordList> makeWordList(bool physLayout);
#endif

private:
    // Destructor.  Private, so code outside this class and its friends
    // cannot delete a TextPage directly.
    ~TextPage();

    // Internal helpers; their behavior is defined in the implementation
    // file.
    void clear();
    void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const;
    int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const;
    void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax);

    bool rawOrder; // keep text in content stream order
    bool discardDiag; // discard diagonal text
    bool mergeCombining; // merge when combining and base characters
                         // are drawn on top of each other

    double pageWidth, pageHeight; // width and height of current page
    TextWord *curWord; // currently active string
    int charPos; // next character position (within content
                 //   stream)
    TextFontInfo *curFont; // current font
    double curFontSize; // current font size
    int nest; // current nesting level (for Type 3 fonts)
    int nTinyChars; // number of "tiny" chars seen so far
    bool lastCharOverlap; // set if the last added char overlapped the
                          //   previous char
    bool diagonal; // whether the current text is diagonal

    std::unique_ptr<TextPool> pools[4]; // a "pool" of TextWords for each rotation
    TextFlow *flows; // linked list of flows
    TextBlock **blocks; // array of blocks, in yx order
    int nBlocks; // number of blocks
    int primaryRot; // primary rotation
    bool primaryLR; // primary direction (true means L-to-R,
                    //   false means R-to-L)
    TextWord *rawWords; // list of words, in raw order (only if
                        //   rawOrder is set)
    TextWord *rawLastWord; // last word on rawWords list

    std::vector<std::unique_ptr<TextFontInfo>> fonts; // all font info objects used on this page

    double lastFindXMin, // coordinates of the last "find" result
            lastFindYMin;
    bool haveLastFind; // presumably set once a find result has been
                       //   recorded -- confirm in the implementation

    std::vector<std::unique_ptr<TextUnderline>> underlines; // underlines added via addUnderline()
    std::vector<std::unique_ptr<TextLink>> links; // links added via addLink()

    int refCnt; // reference count (see incRefCnt()/decRefCnt())

    friend class TextLine;
    friend class TextLineFrag;
    friend class TextBlock;
    friend class TextFlow;
    friend class TextWordList;
    friend class TextSelectionPainter;
    friend class TextSelectionDumper;
};
722 
723 //------------------------------------------------------------------------
724 // ActualText
725 //------------------------------------------------------------------------
726 
// Helper that tracks a span of replacement ("actual") text and the
// characters drawn inside it, on behalf of a TextPage.
class POPPLER_PRIVATE_EXPORT ActualText
{
public:
    // Create an ActualText that works on the TextPage <out>.
    // NOTE(review): only a raw pointer to the page is stored here; whether
    // the constructor/destructor adjust the page's reference count is
    // not visible in this header -- confirm in the implementation.
    explicit ActualText(TextPage *out);
    ~ActualText();

    // Copying is disabled.
    ActualText(const ActualText &) = delete;
    ActualText &operator=(const ActualText &) = delete;

    // Report a character; same parameter list as TextPage::addChar().
    void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen);
    // Begin a span whose replacement text is <text>.
    void begin(const GfxState *state, const GooString *text);
    // End the current span.
    void end(const GfxState *state);

private:
    TextPage *text; // the page this helper works on

    GooString *actualText; // replacement text for the span
    // Coordinates and byte count associated with the current span.
    // NOTE(review): presumably the bounding box and total byte count of
    // the characters seen since begin() -- confirm in the implementation.
    double actualTextX0;
    double actualTextY0;
    double actualTextX1;
    double actualTextY1;
    int actualTextNBytes;
};
751 
752 //------------------------------------------------------------------------
753 // TextOutputDev
754 //------------------------------------------------------------------------
755 
756 class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev
757 {
758 public:
759     // Open a text output file.  If <fileName> is NULL, no file is
760     // written (this is useful, e.g., for searching text).  If
761     // <physLayoutA> is true, the original physical layout of the text
762     // is maintained.  If <rawOrder> is true, the text is kept in
763     // content stream order.  If <discardDiag> is true, diagonal text
764     // is removed from output.
765     TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false);
766 
767     // Create a TextOutputDev which will write to a generic stream.  If
768     // <physLayoutA> is true, the original physical layout of the text
769     // is maintained.  If <rawOrder> is true, the text is kept in
770     // content stream order.  If <discardDiag> is true, diagonal text
771     // is removed from output.
772     TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false);
773 
774     // Destructor.
775     ~TextOutputDev() override;
776 
777     // Check if file was successfully created.
isOk()778     virtual bool isOk() { return ok; }
779 
780     //---- get info about output device
781 
782     // Does this device use upside-down coordinates?
783     // (Upside-down means (0,0) is the top left corner of the page.)
upsideDown()784     bool upsideDown() override { return true; }
785 
786     // Does this device use drawChar() or drawString()?
useDrawChar()787     bool useDrawChar() override { return true; }
788 
789     // Does this device use beginType3Char/endType3Char?  Otherwise,
790     // text in Type 3 fonts will be drawn with drawChar/drawString.
interpretType3Chars()791     bool interpretType3Chars() override { return false; }
792 
793     // Does this device need non-text content?
needNonText()794     bool needNonText() override { return false; }
795 
796     // Does this device require incCharCount to be called for text on
797     // non-shown layers?
needCharCount()798     bool needCharCount() override { return true; }
799 
800     //----- initialization and control
801 
802     // Start a page.
803     void startPage(int pageNum, GfxState *state, XRef *xref) override;
804 
805     // End a page.
806     void endPage() override;
807 
808     //----- save/restore graphics state
809     void restoreState(GfxState *state) override;
810 
811     //----- update text state
812     void updateFont(GfxState *state) override;
813 
814     //----- text drawing
815     void beginString(GfxState *state, const GooString *s) override;
816     void endString(GfxState *state) override;
817     void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override;
818     void incCharCount(int nChars) override;
819     void beginActualText(GfxState *state, const GooString *text) override;
820     void endActualText(GfxState *state) override;
821 
822     //----- path painting
823     void stroke(GfxState *state) override;
824     void fill(GfxState *state) override;
825     void eoFill(GfxState *state) override;
826 
827     //----- link borders
828     void processLink(AnnotLink *link) override;
829 
830     //----- special access
831 
832     // Find a string.  If <startAtTop> is true, starts looking at the
833     // top of the page; else if <startAtLast> is true, starts looking
834     // immediately after the last find result; else starts looking at
835     // <xMin>,<yMin>.  If <stopAtBottom> is true, stops looking at the
836     // bottom of the page; else if <stopAtLast> is true, stops looking
837     // just before the last find result; else stops looking at
838     // <xMax>,<yMax>.
839     bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const;
840 
841     // Get the text which is inside the specified rectangle.
842     GooString *getText(double xMin, double yMin, double xMax, double yMax) const;
843 
844     // Find a string by character position and length.  If found, sets
845     // the text bounding rectangle and returns true; otherwise returns
846     // false.
847     bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const;
848 
849     void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color);
850 
851     std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale);
852 
853     GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style);
854 
855     // If true, will combine characters when a base and combining
856     // character are drawn on eachother.
857     void setMergeCombining(bool merge);
858 
859 #ifdef TEXTOUT_WORD_LIST
860     // Build a flat word list, in content stream order (if
861     // this->rawOrder is true), physical layout order (if
862     // this->physLayout is true and this->rawOrder is false), or reading
863     // order (if both flags are false).
864     std::unique_ptr<TextWordList> makeWordList();
865 #endif
866 
867     // Returns the TextPage object for the last rasterized page,
868     // transferring ownership to the caller.
869     TextPage *takeText();
870 
871     // Turn extra processing for HTML conversion on or off.
enableHTMLExtras(bool doHTMLA)872     void enableHTMLExtras(bool doHTMLA) { doHTML = doHTMLA; }
873 
874     // Get the head of the linked list of TextFlows for the
875     // last rasterized page.
876     const TextFlow *getFlows() const;
877 
defaultEndOfLine()878     static constexpr EndOfLineKind defaultEndOfLine()
879     {
880 #if defined(_WIN32)
881         return eolDOS;
882 #else
883         return eolUnix;
884 #endif
885     }
setTextEOL(EndOfLineKind textEOLA)886     void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; }
setTextPageBreaks(bool textPageBreaksA)887     void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; }
888 
889 private:
890     TextOutputFunc outputFunc; // output function
891     void *outputStream; // output stream
892     bool needClose; // need to close the output file?
893                     //   (only if outputStream is a FILE*)
894     TextPage *text; // text for the current page
895     bool physLayout; // maintain original physical layout when
896                      //   dumping text
897     double fixedPitch; // if physLayout is true and this is non-zero,
898                        //   assume fixed-pitch characters with this
899                        //   width
900     bool rawOrder; // keep text in content stream order
901     bool discardDiag; // Diagonal text, i.e., text that is not close to one of the
902                       // 0, 90, 180, or 270 degree axes, is discarded. This is useful
903                       // to skip watermarks drawn on top of body text, etc.
904     bool doHTML; // extra processing for HTML conversion
905     bool ok; // set up ok?
906     bool textPageBreaks; // insert end-of-page markers?
907     EndOfLineKind textEOL; // type of EOL marker to use
908 
909     ActualText *actualText;
910 };
911 
912 #endif
913