1 //======================================================================== 2 // 3 // TextOutputDev.h 4 // 5 // Copyright 1997-2003 Glyph & Cog, LLC 6 // 7 //======================================================================== 8 9 //======================================================================== 10 // 11 // Modified under the Poppler project - http://poppler.freedesktop.org 12 // 13 // All changes made under the Poppler project to this file are licensed 14 // under GPL version 2 or later 15 // 16 // Copyright (C) 2005-2007 Kristian Høgsberg <krh@redhat.com> 17 // Copyright (C) 2006 Ed Catmur <ed@catmur.co.uk> 18 // Copyright (C) 2007, 2008, 2011, 2013 Carlos Garcia Campos <carlosgc@gnome.org> 19 // Copyright (C) 2007, 2017 Adrian Johnson <ajohnson@redneon.com> 20 // Copyright (C) 2008, 2010, 2015, 2016, 2018, 2019, 2021 Albert Astals Cid <aacid@kde.org> 21 // Copyright (C) 2010 Brian Ewins <brian.ewins@gmail.com> 22 // Copyright (C) 2012, 2013, 2015, 2016 Jason Crain <jason@aquaticape.us> 23 // Copyright (C) 2013 Thomas Freitag <Thomas.Freitag@alfa.de> 24 // Copyright (C) 2018 Klarälvdalens Datakonsult AB, a KDAB Group company, <info@kdab.com>. Work sponsored by the LiMux project of the city of Munich 25 // Copyright (C) 2018 Sanchit Anand <sanxchit@gmail.com> 26 // Copyright (C) 2018, 2020, 2021 Nelson Benítez León <nbenitezl@gmail.com> 27 // Copyright (C) 2019 Oliver Sander <oliver.sander@tu-dresden.de> 28 // Copyright (C) 2019 Dan Shea <dan.shea@logical-innovations.com> 29 // Copyright (C) 2020 Suzuki Toshiya <mpsuzuki@hiroshima-u.ac.jp> 30 // 31 // To see a description of the changes please see the Changelog file that 32 // came with your tarball or type make ChangeLog if you are building from git 33 // 34 //======================================================================== 35 36 #ifndef TEXTOUTPUTDEV_H 37 #define TEXTOUTPUTDEV_H 38 39 #include "poppler-config.h" 40 #include "poppler_private_export.h" 41 #include <cstdio> 42 #include "GfxFont.h" 43 #include "GfxState.h" 44 #include "OutputDev.h" 45 46 class GooString; 47 class Gfx; 48 class GfxFont; 49 class GfxState; 50 class UnicodeMap; 51 class AnnotLink; 52 53 class TextWord; 54 class TextPool; 55 class TextLine; 56 class TextLineFrag; 57 class TextBlock; 58 class TextFlow; 59 class TextLink; 60 class TextUnderline; 61 class TextWordList; 62 class TextPage; 63 class TextSelectionVisitor; 64 65 //------------------------------------------------------------------------ 66 67 typedef void (*TextOutputFunc)(void *stream, const char *text, int len); 68 69 enum SelectionStyle 70 { 71 selectionStyleGlyph, 72 selectionStyleWord, 73 selectionStyleLine 74 }; 75 76 enum EndOfLineKind 77 { 78 eolUnix, // LF 79 eolDOS, // CR+LF 80 eolMac // CR 81 }; 82 83 //------------------------------------------------------------------------ 84 // TextFontInfo 85 //------------------------------------------------------------------------ 86 87 class POPPLER_PRIVATE_EXPORT TextFontInfo 88 { 89 public: 90 explicit TextFontInfo(const GfxState *state); 91 ~TextFontInfo(); 92 93 TextFontInfo(const TextFontInfo &) = delete; 94 TextFontInfo &operator=(const TextFontInfo &) = delete; 95 96 bool matches(const GfxState *state) const; 97 bool matches(const TextFontInfo *fontInfo) const; 98 bool matches(const Ref *ref) const; 99 100 // Get the font ascent, or a default value if the font is not set 101 double getAscent() const; 102 103 // Get the font descent, or a default value if the font is not set 104 double getDescent() const; 105 106 // Get the writing mode (0 or 1), or 0 if the font is not set 107 int getWMode() const; 108 109 #ifdef TEXTOUT_WORD_LIST 110 // Get the font name (which may be NULL). getFontName()111 const GooString *getFontName() const { return fontName; } 112 113 // Get font descriptor flags. isFixedWidth()114 bool isFixedWidth() const { return flags & fontFixedWidth; } isSerif()115 bool isSerif() const { return flags & fontSerif; } isSymbolic()116 bool isSymbolic() const { return flags & fontSymbolic; } isItalic()117 bool isItalic() const { return flags & fontItalic; } isBold()118 bool isBold() const { return flags & fontBold; } 119 #endif 120 121 private: 122 GfxFont *gfxFont; 123 #ifdef TEXTOUT_WORD_LIST 124 GooString *fontName; 125 int flags; 126 #endif 127 128 friend class TextWord; 129 friend class TextPage; 130 friend class TextSelectionPainter; 131 }; 132 133 //------------------------------------------------------------------------ 134 // TextWord 135 //------------------------------------------------------------------------ 136 137 class POPPLER_PRIVATE_EXPORT TextWord 138 { 139 public: 140 // Constructor. 141 TextWord(const GfxState *state, int rotA, double fontSize); 142 143 // Destructor. 144 ~TextWord(); 145 146 TextWord(const TextWord &) = delete; 147 TextWord &operator=(const TextWord &) = delete; 148 149 // Add a character to the word. 150 void addChar(const GfxState *state, TextFontInfo *fontA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); 151 152 // Attempt to add a character to the word as a combining character. 153 // Either character u or the last character in the word must be an 154 // acute, dieresis, or other combining character. Returns true if 155 // the character was added. 156 bool addCombining(const GfxState *state, TextFontInfo *fontA, double fontSizeA, double x, double y, double dx, double dy, int charPosA, int charLen, CharCode c, Unicode u, const Matrix &textMatA); 157 158 // Merge <word> onto the end of <this>. 159 void merge(TextWord *word); 160 161 // Compares <this> to <word>, returning -1 (<), 0 (=), or +1 (>), 162 // based on a primary-axis comparison, e.g., x ordering if rot=0. 163 int primaryCmp(const TextWord *word) const; 164 165 // Return the distance along the primary axis between <this> and 166 // <word>. 167 double primaryDelta(const TextWord *word) const; 168 169 static int cmpYX(const void *p1, const void *p2); 170 171 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); 172 173 // Get the TextFontInfo object associated with a character. getFontInfo(int idx)174 const TextFontInfo *getFontInfo(int idx) const { return font[idx]; } 175 176 // Get the next TextWord on the linked list. getNext()177 const TextWord *getNext() const { return next; } 178 179 #ifdef TEXTOUT_WORD_LIST getLength()180 int getLength() const { return len; } getChar(int idx)181 const Unicode *getChar(int idx) const { return &text[idx]; } 182 GooString *getText() const; getFontName(int idx)183 const GooString *getFontName(int idx) const { return font[idx]->fontName; } getColor(double * r,double * g,double * b)184 void getColor(double *r, double *g, double *b) const 185 { 186 *r = colorR; 187 *g = colorG; 188 *b = colorB; 189 } getBBox(double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)190 void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const 191 { 192 *xMinA = xMin; 193 *yMinA = yMin; 194 *xMaxA = xMax; 195 *yMaxA = yMax; 196 } 197 void getCharBBox(int charIdx, double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const; getFontSize()198 double getFontSize() const { return fontSize; } getRotation()199 int getRotation() const { return rot; } getCharPos()200 int getCharPos() const { return charPos[0]; } getCharLen()201 int getCharLen() const { return charPos[len] - charPos[0]; } getSpaceAfter()202 bool getSpaceAfter() const { return spaceAfter; } 203 #endif isUnderlined()204 bool isUnderlined() const { return underlined; } getLink()205 const AnnotLink *getLink() const { return link; } getEdge(int i)206 double getEdge(int i) const { return edge[i]; } getBaseline()207 double getBaseline() const { return base; } hasSpaceAfter()208 bool hasSpaceAfter() const { return spaceAfter; } nextWord()209 const TextWord *nextWord() const { return next; }; 210 211 private: 212 void ensureCapacity(int capacity); 213 void setInitialBounds(TextFontInfo *fontA, double x, double y); 214 215 int rot; // rotation, multiple of 90 degrees 216 // (0, 1, 2, or 3) 217 int wMode; // horizontal (0) or vertical (1) writing mode 218 double xMin, xMax; // bounding box x coordinates 219 double yMin, yMax; // bounding box y coordinates 220 double base; // baseline x or y coordinate 221 Unicode *text; // the text 222 CharCode *charcode; // glyph indices 223 double *edge; // "near" edge x or y coord of each char 224 // (plus one extra entry for the last char) 225 int *charPos; // character position (within content stream) 226 // of each char (plus one extra entry for 227 // the last char) 228 int len; // length of text/edge/charPos/font arrays 229 int size; // size of text/edge/charPos/font arrays 230 TextFontInfo **font; // font information for each char 231 Matrix *textMat; // transformation matrix for each char 232 double fontSize; // font size 233 bool spaceAfter; // set if there is a space between this 234 // word and the next word on the line 235 bool underlined; 236 bool invisible; // whether we are invisible (glyphless) 237 TextWord *next; // next word in line 238 239 #ifdef TEXTOUT_WORD_LIST 240 double colorR, // word color 241 colorG, colorB; 242 #endif 243 244 AnnotLink *link; 245 246 friend class TextPool; 247 friend class TextLine; 248 friend class TextBlock; 249 friend class TextFlow; 250 friend class TextWordList; 251 friend class TextPage; 252 253 friend class TextSelectionPainter; 254 friend class TextSelectionDumper; 255 }; 256 257 //------------------------------------------------------------------------ 258 // TextPool 259 //------------------------------------------------------------------------ 260 261 class TextPool 262 { 263 public: 264 TextPool(); 265 ~TextPool(); 266 267 TextPool(const TextPool &) = delete; 268 TextPool &operator=(const TextPool &) = delete; 269 getPool(int baseIdx)270 TextWord *getPool(int baseIdx) { return pool[baseIdx - minBaseIdx]; } setPool(int baseIdx,TextWord * p)271 void setPool(int baseIdx, TextWord *p) { pool[baseIdx - minBaseIdx] = p; } 272 273 int getBaseIdx(double base) const; 274 275 void addWord(TextWord *word); 276 277 private: 278 int minBaseIdx; // min baseline bucket index 279 int maxBaseIdx; // max baseline bucket index 280 TextWord **pool; // array of linked lists, one for each 281 // baseline value (multiple of 4 pts) 282 TextWord *cursor; // pointer to last-accessed word 283 int cursorBaseIdx; // baseline bucket index of last-accessed word 284 285 friend class TextBlock; 286 friend class TextPage; 287 }; 288 289 struct TextFlowData; 290 291 //------------------------------------------------------------------------ 292 // TextLine 293 //------------------------------------------------------------------------ 294 295 class TextLine 296 { 297 public: 298 TextLine(TextBlock *blkA, int rotA, double baseA); 299 ~TextLine(); 300 301 TextLine(const TextLine &) = delete; 302 TextLine &operator=(const TextLine &) = delete; 303 304 void addWord(TextWord *word); 305 306 // Return the distance along the primary axis between <this> and 307 // <line>. 308 double primaryDelta(const TextLine *line) const; 309 310 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>), 311 // based on a primary-axis comparison, e.g., x ordering if rot=0. 312 int primaryCmp(const TextLine *line) const; 313 314 // Compares <this> to <line>, returning -1 (<), 0 (=), or +1 (>), 315 // based on a secondary-axis comparison of the baselines, e.g., y 316 // ordering if rot=0. 317 int secondaryCmp(const TextLine *line) const; 318 319 int cmpYX(const TextLine *line) const; 320 321 static int cmpXY(const void *p1, const void *p2); 322 323 void coalesce(const UnicodeMap *uMap); 324 325 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); 326 327 // Get the head of the linked list of TextWords. getWords()328 const TextWord *getWords() const { return words; } 329 330 // Get the next TextLine on the linked list. getNext()331 const TextLine *getNext() const { return next; } 332 333 // Returns true if the last char of the line is a hyphen. isHyphenated()334 bool isHyphenated() const { return hyphenated; } 335 336 private: 337 TextBlock *blk; // parent block 338 int rot; // text rotation 339 double xMin, xMax; // bounding box x coordinates 340 double yMin, yMax; // bounding box y coordinates 341 double base; // baseline x or y coordinate 342 TextWord *words; // words in this line 343 TextWord *lastWord; // last word in this line 344 Unicode *text; // Unicode text of the line, including 345 // spaces between words 346 double *edge; // "near" edge x or y coord of each char 347 // (plus one extra entry for the last char) 348 int *col; // starting column number of each Unicode char 349 int len; // number of Unicode chars 350 int convertedLen; // total number of converted characters 351 bool hyphenated; // set if last char is a hyphen 352 TextLine *next; // next line in block 353 Unicode *normalized; // normalized form of Unicode text 354 int normalized_len; // number of normalized Unicode chars 355 int *normalized_idx; // indices of normalized chars into Unicode text 356 Unicode *ascii_translation; // ascii translation from the normalized text 357 int ascii_len; // length of ascii translation text 358 int *ascii_idx; // indices of ascii chars into Unicode text of line 359 360 friend class TextLineFrag; 361 friend class TextBlock; 362 friend class TextFlow; 363 friend class TextWordList; 364 friend class TextPage; 365 366 friend class TextSelectionPainter; 367 friend class TextSelectionSizer; 368 friend class TextSelectionDumper; 369 }; 370 371 //------------------------------------------------------------------------ 372 // TextBlock 373 //------------------------------------------------------------------------ 374 375 class TextBlock 376 { 377 public: 378 TextBlock(TextPage *pageA, int rotA); 379 ~TextBlock(); 380 381 TextBlock(const TextBlock &) = delete; 382 TextBlock &operator=(const TextBlock &) = delete; 383 384 void addWord(TextWord *word); 385 386 void coalesce(const UnicodeMap *uMap, double fixedPitch); 387 388 // Update this block's priMin and priMax values, looking at <blk>. 389 void updatePriMinMax(const TextBlock *blk); 390 391 static int cmpXYPrimaryRot(const void *p1, const void *p2); 392 393 static int cmpYXPrimaryRot(const void *p1, const void *p2); 394 395 int primaryCmp(const TextBlock *blk) const; 396 397 double secondaryDelta(const TextBlock *blk) const; 398 399 // Returns true if <this> is below <blk>, relative to the page's 400 // primary rotation. 401 bool isBelow(const TextBlock *blk) const; 402 403 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); 404 405 // Get the head of the linked list of TextLines. getLines()406 const TextLine *getLines() const { return lines; } 407 408 // Get the next TextBlock on the linked list. getNext()409 const TextBlock *getNext() const { return next; } 410 getBBox(double * xMinA,double * yMinA,double * xMaxA,double * yMaxA)411 void getBBox(double *xMinA, double *yMinA, double *xMaxA, double *yMaxA) const 412 { 413 *xMinA = xMin; 414 *yMinA = yMin; 415 *xMaxA = xMax; 416 *yMaxA = yMax; 417 } 418 getLineCount()419 int getLineCount() const { return nLines; } 420 421 private: 422 bool isBeforeByRule1(const TextBlock *blk1); 423 bool isBeforeByRepeatedRule1(const TextBlock *blkList, const TextBlock *blk1); 424 bool isBeforeByRule2(const TextBlock *blk1); 425 426 int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited); 427 int visitDepthFirst(TextBlock *blkList, int pos1, TextBlock **sorted, int sortPos, bool *visited, TextBlock **cache, int cacheSize); 428 429 TextPage *page; // the parent page 430 int rot; // text rotation 431 double xMin, xMax; // bounding box x coordinates 432 double yMin, yMax; // bounding box y coordinates 433 double priMin, priMax; // whitespace bounding box along primary axis 434 double ExMin, ExMax; // extended bounding box x coordinates 435 double EyMin, EyMax; // extended bounding box y coordinates 436 int tableId; // id of table to which this block belongs 437 bool tableEnd; // is this block at end of line of actual table 438 439 TextPool *pool; // pool of words (used only until lines 440 // are built) 441 TextLine *lines; // linked list of lines 442 TextLine *curLine; // most recently added line 443 int nLines; // number of lines 444 int charCount; // number of characters in the block 445 int col; // starting column 446 int nColumns; // number of columns in the block 447 448 TextBlock *next; 449 TextBlock *stackNext; 450 451 friend class TextLine; 452 friend class TextLineFrag; 453 friend class TextFlow; 454 friend class TextWordList; 455 friend class TextPage; 456 friend class TextSelectionPainter; 457 friend class TextSelectionDumper; 458 }; 459 460 //------------------------------------------------------------------------ 461 // TextFlow 462 //------------------------------------------------------------------------ 463 464 class TextFlow 465 { 466 public: 467 TextFlow(TextPage *pageA, TextBlock *blk); 468 ~TextFlow(); 469 470 TextFlow(const TextFlow &) = delete; 471 TextFlow &operator=(const TextFlow &) = delete; 472 473 // Add a block to the end of this flow. 474 void addBlock(TextBlock *blk); 475 476 // Returns true if <blk> fits below <prevBlk> in the flow, i.e., (1) 477 // it uses a font no larger than the last block added to the flow, 478 // and (2) it fits within the flow's [priMin, priMax] along the 479 // primary axis. 480 bool blockFits(const TextBlock *blk, const TextBlock *prevBlk) const; 481 482 // Get the head of the linked list of TextBlocks. getBlocks()483 const TextBlock *getBlocks() const { return blocks; } 484 485 // Get the next TextFlow on the linked list. getNext()486 const TextFlow *getNext() const { return next; } 487 488 private: 489 TextPage *page; // the parent page 490 double xMin, xMax; // bounding box x coordinates 491 double yMin, yMax; // bounding box y coordinates 492 double priMin, priMax; // whitespace bounding box along primary axis 493 TextBlock *blocks; // blocks in flow 494 TextBlock *lastBlk; // last block in this flow 495 TextFlow *next; 496 497 friend class TextWordList; 498 friend class TextPage; 499 }; 500 501 #ifdef TEXTOUT_WORD_LIST 502 503 //------------------------------------------------------------------------ 504 // TextWordList 505 //------------------------------------------------------------------------ 506 507 class POPPLER_PRIVATE_EXPORT TextWordList 508 { 509 public: 510 // Build a flat word list, in content stream order (if 511 // text->rawOrder is true), physical layout order (if <physLayout> 512 // is true and text->rawOrder is false), or reading order (if both 513 // flags are false). 514 TextWordList(const TextPage *text, bool physLayout); 515 516 ~TextWordList(); 517 518 TextWordList(const TextWordList &) = delete; 519 TextWordList &operator=(const TextWordList &) = delete; 520 521 // Return the number of words on the list. 522 int getLength() const; 523 524 // Return the <idx>th word from the list. 525 TextWord *get(int idx); 526 527 private: 528 std::vector<TextWord *> words; 529 }; 530 531 #endif // TEXTOUT_WORD_LIST 532 533 class TextWordSelection 534 { 535 public: TextWordSelection(const TextWord * wordA,int beginA,int endA)536 TextWordSelection(const TextWord *wordA, int beginA, int endA) : word(wordA), begin(beginA), end(endA) { } 537 getWord()538 const TextWord *getWord() const { return word; } getBegin()539 int getBegin() const { return begin; } getEnd()540 int getEnd() const { return end; } 541 542 private: 543 const TextWord *word; 544 int begin; 545 int end; 546 547 friend class TextSelectionPainter; 548 friend class TextSelectionDumper; 549 }; 550 551 //------------------------------------------------------------------------ 552 // TextPage 553 //------------------------------------------------------------------------ 554 555 class POPPLER_PRIVATE_EXPORT TextPage 556 { 557 public: 558 // Constructor. 559 explicit TextPage(bool rawOrderA, bool discardDiagA = false); 560 561 TextPage(const TextPage &) = delete; 562 TextPage &operator=(const TextPage &) = delete; 563 564 void incRefCnt(); 565 void decRefCnt(); 566 567 // Start a new page. 568 void startPage(const GfxState *state); 569 570 // End the current page. 571 void endPage(); 572 573 // Update the current font. 574 void updateFont(const GfxState *state); 575 576 // Begin a new word. 577 void beginWord(const GfxState *state); 578 579 // Add a character to the current word. 580 void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen); 581 582 // Add <nChars> invisible characters. 583 void incCharCount(int nChars); 584 585 // End the current word, sorting it into the list of words. 586 void endWord(); 587 588 // Add a word, sorting it into the list of words. 589 void addWord(TextWord *word); 590 591 // Add a (potential) underline. 592 void addUnderline(double x0, double y0, double x1, double y1); 593 594 // Add a hyperlink. 595 void addLink(int xMin, int yMin, int xMax, int yMax, AnnotLink *link); 596 597 // Coalesce strings that look like parts of the same line. 598 void coalesce(bool physLayout, double fixedPitch, bool doHTML); 599 600 // Find a string. If <startAtTop> is true, starts looking at the 601 // top of the page; else if <startAtLast> is true, starts looking 602 // immediately after the last find result; else starts looking at 603 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the 604 // bottom of the page; else if <stopAtLast> is true, stops looking 605 // just before the last find result; else stops looking at 606 // <xMax>,<yMax>. 607 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax); 608 609 // Adds new parameter ignoreDiacritics, which will do diacritics 610 // insensitive search, i.e. ignore accents, umlauts, diaeresis,etc. 611 // while matching. This option will be ignored if <s> contains characters 612 // which are not pure ascii. 613 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, 614 double *yMax); 615 616 // Adds new parameter <matchAcrossLines>, which allows <s> to match on text 617 // spanning from end of a line to the next line. In that case, the rect for 618 // the part of match that falls on the next line will be stored in 619 // <continueMatch>, and if hyphenation (i.e. ignoring hyphen at end of line) 620 // was used while matching at the end of the line prior to <continueMatch>, 621 // then <ignoredHyphen> will be true, otherwise will be false. 622 // Only finding across two lines is supported, i.e. it won't match where <s> 623 // spans more than two lines. 624 // 625 // <matchAcrossLines> will be ignored if <backward> is true (as that 626 // combination has not been implemented yet). 627 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool ignoreDiacritics, bool matchAcrossLines, bool backward, bool wholeWord, double *xMin, double *yMin, 628 double *xMax, double *yMax, PDFRectangle *continueMatch, bool *ignoredHyphen); 629 630 // Get the text which is inside the specified rectangle. 631 GooString *getText(double xMin, double yMin, double xMax, double yMax, EndOfLineKind textEOL) const; 632 633 void visitSelection(TextSelectionVisitor *visitor, const PDFRectangle *selection, SelectionStyle style); 634 635 void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color); 636 637 std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale); 638 639 GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style); 640 641 std::vector<TextWordSelection *> **getSelectionWords(const PDFRectangle *selection, SelectionStyle style, int *nLines); 642 643 // Find a string by character position and length. If found, sets 644 // the text bounding rectangle and returns true; otherwise returns 645 // false. 646 bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const; 647 648 // Dump contents of page to a file. 649 void dump(void *outputStream, TextOutputFunc outputFunc, bool physLayout, EndOfLineKind textEOL, bool pageBreaks); 650 651 // Get the head of the linked list of TextFlows. getFlows()652 const TextFlow *getFlows() const { return flows; } 653 654 // If true, will combine characters when a base and combining 655 // character are drawn on eachother. 656 void setMergeCombining(bool merge); 657 658 #ifdef TEXTOUT_WORD_LIST 659 // Build a flat word list, in content stream order (if 660 // this->rawOrder is true), physical layout order (if <physLayout> 661 // is true and this->rawOrder is false), or reading order (if both 662 // flags are false). 663 std::unique_ptr<TextWordList> makeWordList(bool physLayout); 664 #endif 665 666 private: 667 // Destructor. 668 ~TextPage(); 669 670 void clear(); 671 void assignColumns(TextLineFrag *frags, int nFrags, bool rot) const; 672 int dumpFragment(const Unicode *text, int len, const UnicodeMap *uMap, GooString *s) const; 673 void adjustRotation(TextLine *line, int start, int end, double *xMin, double *xMax, double *yMin, double *yMax); 674 675 bool rawOrder; // keep text in content stream order 676 bool discardDiag; // discard diagonal text 677 bool mergeCombining; // merge when combining and base characters 678 // are drawn on top of each other 679 680 double pageWidth, pageHeight; // width and height of current page 681 TextWord *curWord; // currently active string 682 int charPos; // next character position (within content 683 // stream) 684 TextFontInfo *curFont; // current font 685 double curFontSize; // current font size 686 int nest; // current nesting level (for Type 3 fonts) 687 int nTinyChars; // number of "tiny" chars seen so far 688 bool lastCharOverlap; // set if the last added char overlapped the 689 // previous char 690 bool diagonal; // whether the current text is diagonal 691 692 std::unique_ptr<TextPool> pools[4]; // a "pool" of TextWords for each rotation 693 TextFlow *flows; // linked list of flows 694 TextBlock **blocks; // array of blocks, in yx order 695 int nBlocks; // number of blocks 696 int primaryRot; // primary rotation 697 bool primaryLR; // primary direction (true means L-to-R, 698 // false means R-to-L) 699 TextWord *rawWords; // list of words, in raw order (only if 700 // rawOrder is set) 701 TextWord *rawLastWord; // last word on rawWords list 702 703 std::vector<std::unique_ptr<TextFontInfo>> fonts; // all font info objects used on this page 704 705 double lastFindXMin, // coordinates of the last "find" result 706 lastFindYMin; 707 bool haveLastFind; 708 709 std::vector<std::unique_ptr<TextUnderline>> underlines; 710 std::vector<std::unique_ptr<TextLink>> links; 711 712 int refCnt; 713 714 friend class TextLine; 715 friend class TextLineFrag; 716 friend class TextBlock; 717 friend class TextFlow; 718 friend class TextWordList; 719 friend class TextSelectionPainter; 720 friend class TextSelectionDumper; 721 }; 722 723 //------------------------------------------------------------------------ 724 // ActualText 725 //------------------------------------------------------------------------ 726 727 class POPPLER_PRIVATE_EXPORT ActualText 728 { 729 public: 730 // Create an ActualText 731 explicit ActualText(TextPage *out); 732 ~ActualText(); 733 734 ActualText(const ActualText &) = delete; 735 ActualText &operator=(const ActualText &) = delete; 736 737 void addChar(const GfxState *state, double x, double y, double dx, double dy, CharCode c, int nBytes, const Unicode *u, int uLen); 738 void begin(const GfxState *state, const GooString *text); 739 void end(const GfxState *state); 740 741 private: 742 TextPage *text; 743 744 GooString *actualText; // replacement text for the span 745 double actualTextX0; 746 double actualTextY0; 747 double actualTextX1; 748 double actualTextY1; 749 int actualTextNBytes; 750 }; 751 752 //------------------------------------------------------------------------ 753 // TextOutputDev 754 //------------------------------------------------------------------------ 755 756 class POPPLER_PRIVATE_EXPORT TextOutputDev : public OutputDev 757 { 758 public: 759 // Open a text output file. If <fileName> is NULL, no file is 760 // written (this is useful, e.g., for searching text). If 761 // <physLayoutA> is true, the original physical layout of the text 762 // is maintained. If <rawOrder> is true, the text is kept in 763 // content stream order. If <discardDiag> is true, diagonal text 764 // is removed from output. 765 TextOutputDev(const char *fileName, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool append, bool discardDiagA = false); 766 767 // Create a TextOutputDev which will write to a generic stream. If 768 // <physLayoutA> is true, the original physical layout of the text 769 // is maintained. If <rawOrder> is true, the text is kept in 770 // content stream order. If <discardDiag> is true, diagonal text 771 // is removed from output. 772 TextOutputDev(TextOutputFunc func, void *stream, bool physLayoutA, double fixedPitchA, bool rawOrderA, bool discardDiagA = false); 773 774 // Destructor. 775 ~TextOutputDev() override; 776 777 // Check if file was successfully created. isOk()778 virtual bool isOk() { return ok; } 779 780 //---- get info about output device 781 782 // Does this device use upside-down coordinates? 783 // (Upside-down means (0,0) is the top left corner of the page.) upsideDown()784 bool upsideDown() override { return true; } 785 786 // Does this device use drawChar() or drawString()? useDrawChar()787 bool useDrawChar() override { return true; } 788 789 // Does this device use beginType3Char/endType3Char? Otherwise, 790 // text in Type 3 fonts will be drawn with drawChar/drawString. interpretType3Chars()791 bool interpretType3Chars() override { return false; } 792 793 // Does this device need non-text content? needNonText()794 bool needNonText() override { return false; } 795 796 // Does this device require incCharCount to be called for text on 797 // non-shown layers? needCharCount()798 bool needCharCount() override { return true; } 799 800 //----- initialization and control 801 802 // Start a page. 803 void startPage(int pageNum, GfxState *state, XRef *xref) override; 804 805 // End a page. 806 void endPage() override; 807 808 //----- save/restore graphics state 809 void restoreState(GfxState *state) override; 810 811 //----- update text state 812 void updateFont(GfxState *state) override; 813 814 //----- text drawing 815 void beginString(GfxState *state, const GooString *s) override; 816 void endString(GfxState *state) override; 817 void drawChar(GfxState *state, double x, double y, double dx, double dy, double originX, double originY, CharCode c, int nBytes, const Unicode *u, int uLen) override; 818 void incCharCount(int nChars) override; 819 void beginActualText(GfxState *state, const GooString *text) override; 820 void endActualText(GfxState *state) override; 821 822 //----- path painting 823 void stroke(GfxState *state) override; 824 void fill(GfxState *state) override; 825 void eoFill(GfxState *state) override; 826 827 //----- link borders 828 void processLink(AnnotLink *link) override; 829 830 //----- special access 831 832 // Find a string. If <startAtTop> is true, starts looking at the 833 // top of the page; else if <startAtLast> is true, starts looking 834 // immediately after the last find result; else starts looking at 835 // <xMin>,<yMin>. If <stopAtBottom> is true, stops looking at the 836 // bottom of the page; else if <stopAtLast> is true, stops looking 837 // just before the last find result; else stops looking at 838 // <xMax>,<yMax>. 839 bool findText(const Unicode *s, int len, bool startAtTop, bool stopAtBottom, bool startAtLast, bool stopAtLast, bool caseSensitive, bool backward, bool wholeWord, double *xMin, double *yMin, double *xMax, double *yMax) const; 840 841 // Get the text which is inside the specified rectangle. 842 GooString *getText(double xMin, double yMin, double xMax, double yMax) const; 843 844 // Find a string by character position and length. If found, sets 845 // the text bounding rectangle and returns true; otherwise returns 846 // false. 847 bool findCharRange(int pos, int length, double *xMin, double *yMin, double *xMax, double *yMax) const; 848 849 void drawSelection(OutputDev *out, double scale, int rotation, const PDFRectangle *selection, SelectionStyle style, const GfxColor *glyph_color, const GfxColor *box_color); 850 851 std::vector<PDFRectangle *> *getSelectionRegion(const PDFRectangle *selection, SelectionStyle style, double scale); 852 853 GooString *getSelectionText(const PDFRectangle *selection, SelectionStyle style); 854 855 // If true, will combine characters when a base and combining 856 // character are drawn on eachother. 857 void setMergeCombining(bool merge); 858 859 #ifdef TEXTOUT_WORD_LIST 860 // Build a flat word list, in content stream order (if 861 // this->rawOrder is true), physical layout order (if 862 // this->physLayout is true and this->rawOrder is false), or reading 863 // order (if both flags are false). 864 std::unique_ptr<TextWordList> makeWordList(); 865 #endif 866 867 // Returns the TextPage object for the last rasterized page, 868 // transferring ownership to the caller. 869 TextPage *takeText(); 870 871 // Turn extra processing for HTML conversion on or off. enableHTMLExtras(bool doHTMLA)872 void enableHTMLExtras(bool doHTMLA) { doHTML = doHTMLA; } 873 874 // Get the head of the linked list of TextFlows for the 875 // last rasterized page. 876 const TextFlow *getFlows() const; 877 defaultEndOfLine()878 static constexpr EndOfLineKind defaultEndOfLine() 879 { 880 #if defined(_WIN32) 881 return eolDOS; 882 #else 883 return eolUnix; 884 #endif 885 } setTextEOL(EndOfLineKind textEOLA)886 void setTextEOL(EndOfLineKind textEOLA) { textEOL = textEOLA; } setTextPageBreaks(bool textPageBreaksA)887 void setTextPageBreaks(bool textPageBreaksA) { textPageBreaks = textPageBreaksA; } 888 889 private: 890 TextOutputFunc outputFunc; // output function 891 void *outputStream; // output stream 892 bool needClose; // need to close the output file? 893 // (only if outputStream is a FILE*) 894 TextPage *text; // text for the current page 895 bool physLayout; // maintain original physical layout when 896 // dumping text 897 double fixedPitch; // if physLayout is true and this is non-zero, 898 // assume fixed-pitch characters with this 899 // width 900 bool rawOrder; // keep text in content stream order 901 bool discardDiag; // Diagonal text, i.e., text that is not close to one of the 902 // 0, 90, 180, or 270 degree axes, is discarded. This is useful 903 // to skip watermarks drawn on top of body text, etc. 904 bool doHTML; // extra processing for HTML conversion 905 bool ok; // set up ok? 906 bool textPageBreaks; // insert end-of-page markers? 907 EndOfLineKind textEOL; // type of EOL marker to use 908 909 ActualText *actualText; 910 }; 911 912 #endif 913