1 /*
2     SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
3 
4     SPDX-License-Identifier: GPL-2.0-or-later
5 */
6 
7 #include "textpage.h"
8 #include "textpage_p.h"
9 
10 #include <QDebug>
11 
12 #include "area.h"
13 #include "debug_p.h"
14 #include "misc.h"
15 #include "page.h"
16 #include "page_p.h"
17 
18 #include <cstring>
19 
20 #include <QVarLengthArray>
21 #include <QtAlgorithms>
22 
23 using namespace Okular;
24 
25 class SearchPoint
26 {
27 public:
SearchPoint()28     SearchPoint()
29         : offset_begin(-1)
30         , offset_end(-1)
31     {
32     }
33 
34     /** The TinyTextEntity containing the first character of the match. */
35     TextList::ConstIterator it_begin;
36 
37     /** The TinyTextEntity containing the last character of the match. */
38     TextList::ConstIterator it_end;
39 
40     /** The index of the first character of the match in (*it_begin)->text().
41      *  Satisfies 0 <= offset_begin < (*it_begin)->text().length().
42      */
43     int offset_begin;
44 
45     /** One plus the index of the last character of the match in (*it_end)->text().
46      *  Satisfies 0 < offset_end <= (*it_end)->text().length().
47      */
48     int offset_end;
49 };
50 
51 /* text comparison functions */
52 
CaseInsensitiveCmpFn(const QStringRef & from,const QStringRef & to)53 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
54 {
55 #ifdef DEBUG_TEXTPAGE
56     qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
57 #endif
58     return from.compare(to, Qt::CaseInsensitive) == 0;
59 }
60 
CaseSensitiveCmpFn(const QStringRef & from,const QStringRef & to)61 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
62 {
63 #ifdef DEBUG_TEXTPAGE
64     qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
65 #endif
66     return from.compare(to, Qt::CaseSensitive) == 0;
67 }
68 
69 /**
70  * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
71  * overlap within @p threshold percent, i. e. iff the ratio of the length of the
72  * intersection of the segments to the length of the shortest of the two input segments
73  * is not smaller than the threshold.
74  */
segmentsOverlap(double left1,double right1,double left2,double right2,int threshold)75 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
76 {
77     // check if one consumes another fully (speed optimization)
78 
79     if (left1 <= left2 && right1 >= right2)
80         return true;
81 
82     if (left1 >= left2 && right1 <= right2)
83         return true;
84 
85     // check if there is overlap above threshold
86     if (right2 >= left1 && right1 >= left2) {
87         double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
88 
89         double length1 = right1 - left1, length2 = right2 - left2;
90 
91         return overlap * 100 >= threshold * qMin(length1, length2);
92     }
93 
94     return false;
95 }
96 
doesConsumeY(const QRect first,const QRect second,int threshold)97 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
98 {
99     return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
100 }
101 
doesConsumeY(const NormalizedRect & first,const NormalizedRect & second,int threshold)102 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
103 {
104     return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
105 }
106 
107 /*
108   Rationale behind TinyTextEntity:
109 
110   instead of storing directly a QString for the text of an entity,
111   we store the UTF-16 data and their length. This way, we save about
112   4 int's wrt a QString, and we can create a new string from that
113   raw data (that's the only penalty of that).
114   Even better, if the string we need to store has at most
115   MaxStaticChars characters, then we store those in place of the QChar*
116   that would be used (with new[] + free[]) for the data.
117  */
118 class TinyTextEntity
119 {
120     static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
121 
122 public:
TinyTextEntity(const QString & text,const NormalizedRect & rect)123     TinyTextEntity(const QString &text, const NormalizedRect &rect)
124         : area(rect)
125     {
126         Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
127         Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
128         length = text.length();
129         switch (length) {
130 #if QT_POINTER_SIZE >= 8
131         case 4:
132             d.qc[3] = text.at(3).unicode();
133             // fall through
134         case 3:
135             d.qc[2] = text.at(2).unicode();
136 #endif
137             // fall through
138         case 2:
139             d.qc[1] = text.at(1).unicode();
140             // fall through
141         case 1:
142             d.qc[0] = text.at(0).unicode();
143             break;
144         default:
145             d.data = new QChar[length];
146             std::memcpy(d.data, text.constData(), length * sizeof(QChar));
147         }
148     }
149 
~TinyTextEntity()150     ~TinyTextEntity()
151     {
152         if (length > MaxStaticChars) {
153             delete[] d.data;
154         }
155     }
156 
text() const157     inline QString text() const
158     {
159         return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
160     }
161 
transformedArea(const QTransform & matrix) const162     inline NormalizedRect transformedArea(const QTransform &matrix) const
163     {
164         NormalizedRect transformed_area = area;
165         transformed_area.transform(matrix);
166         return transformed_area;
167     }
168 
169     NormalizedRect area;
170 
171 private:
172     Q_DISABLE_COPY(TinyTextEntity)
173 
174     union {
175         QChar *data;
176         ushort qc[MaxStaticChars];
177     } d;
178     int length;
179 };
180 
TextEntity(const QString & text,NormalizedRect * area)181 TextEntity::TextEntity(const QString &text, NormalizedRect *area)
182     : m_text(text)
183     , m_area(area)
184     , d(nullptr)
185 {
186 }
187 
~TextEntity()188 TextEntity::~TextEntity()
189 {
190     delete m_area;
191 }
192 
text() const193 QString TextEntity::text() const
194 {
195     return m_text;
196 }
197 
area() const198 NormalizedRect *TextEntity::area() const
199 {
200     return m_area;
201 }
202 
transformedArea(const QTransform & matrix) const203 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const
204 {
205     NormalizedRect transformed_area = *m_area;
206     transformed_area.transform(matrix);
207     return transformed_area;
208 }
209 
TextPagePrivate()210 TextPagePrivate::TextPagePrivate()
211     : m_page(nullptr)
212 {
213 }
214 
~TextPagePrivate()215 TextPagePrivate::~TextPagePrivate()
216 {
217     qDeleteAll(m_searchPoints);
218     qDeleteAll(m_words);
219 }
220 
TextPage()221 TextPage::TextPage()
222     : d(new TextPagePrivate())
223 {
224 }
225 
TextPage(const TextEntity::List & words)226 TextPage::TextPage(const TextEntity::List &words)
227     : d(new TextPagePrivate())
228 {
229     TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd();
230     for (; it != itEnd; ++it) {
231         TextEntity *e = *it;
232         if (!e->text().isEmpty())
233             d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
234         delete e;
235     }
236 }
237 
~TextPage()238 TextPage::~TextPage()
239 {
240     delete d;
241 }
242 
append(const QString & text,NormalizedRect * area)243 void TextPage::append(const QString &text, NormalizedRect *area)
244 {
245     if (!text.isEmpty()) {
246         if (!d->m_words.isEmpty()) {
247             TinyTextEntity *lastEntity = d->m_words.last();
248             const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
249             if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
250                 // If this happens it means that the new text + old one have combined, for example A and ◌̊  form Å
251                 NormalizedRect newArea = *area | lastEntity->area;
252                 delete area;
253                 delete lastEntity;
254                 d->m_words.removeLast();
255                 d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
256                 return;
257             }
258         }
259 
260         d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
261     }
262     delete area;
263 }
264 
265 struct WordWithCharacters {
WordWithCharactersWordWithCharacters266     WordWithCharacters(TinyTextEntity *w, const TextList &c)
267         : word(w)
268         , characters(c)
269     {
270     }
271 
textWordWithCharacters272     inline QString text() const
273     {
274         return word->text();
275     }
276 
areaWordWithCharacters277     inline const NormalizedRect &area() const
278     {
279         return word->area;
280     }
281 
282     TinyTextEntity *word;
283     TextList characters;
284 };
285 typedef QList<WordWithCharacters> WordsWithCharacters;
286 
287 /**
288  * We will divide the whole page in some regions depending on the horizontal and
289  * vertical spacing among different regions. Each region will have an area and an
290  * associated WordsWithCharacters in sorted order.
291  */
292 class RegionText
293 {
294 public:
RegionText()295     RegionText() {};
296 
RegionText(const WordsWithCharacters & wordsWithCharacters,const QRect area)297     RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
298         : m_region_wordWithCharacters(wordsWithCharacters)
299         , m_area(area)
300     {
301     }
302 
string() const303     inline QString string() const
304     {
305         QString res;
306         for (const WordWithCharacters &word : m_region_wordWithCharacters) {
307             res += word.text();
308         }
309         return res;
310     }
311 
text() const312     inline WordsWithCharacters text() const
313     {
314         return m_region_wordWithCharacters;
315     }
316 
area() const317     inline QRect area() const
318     {
319         return m_area;
320     }
321 
setArea(const QRect area)322     inline void setArea(const QRect area)
323     {
324         m_area = area;
325     }
326 
setText(const WordsWithCharacters & wordsWithCharacters)327     inline void setText(const WordsWithCharacters &wordsWithCharacters)
328     {
329         m_region_wordWithCharacters = wordsWithCharacters;
330     }
331 
332 private:
333     WordsWithCharacters m_region_wordWithCharacters;
334     QRect m_area;
335 };
336 
textArea(TextSelection * sel) const337 RegularAreaRect *TextPage::textArea(TextSelection *sel) const
338 {
339     if (d->m_words.isEmpty())
340         return new RegularAreaRect();
341 
342     /**
343         It works like this:
344         There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
345         rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
346         points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
347         for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
348         of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
349         1. (rx,ry)x(1,ty)
350         2. (0,ty)x(1,uy)
351         3. (0,uy)x(vx,vy)
352 
353         To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
354         or that has a left border >= cx and bottom border >= cy.
355     */
356     RegularAreaRect *ret = new RegularAreaRect;
357 
358     PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
359     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
360     const double scaleX = d->m_page->width();
361     const double scaleY = d->m_page->height();
362 
363     NormalizedPoint startC = sel->start();
364     NormalizedPoint endC = sel->end();
365     NormalizedPoint temp;
366 
367     // if startPoint is right to endPoint swap them
368     if (startC.x > endC.x) {
369         temp = startC;
370         startC = endC;
371         endC = temp;
372     }
373 
374     // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
375     const NormalizedRect boundingRect = d->m_page->boundingBox();
376     const QRect content = boundingRect.geometry(scaleX, scaleY);
377     const double minX = content.left();
378     const double maxX = content.right();
379     const double minY = content.top();
380     const double maxY = content.bottom();
381 
382     /**
383      * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
384      * the endRectangle. We have four cases:
385      *
386      * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
387      * and endPoint are outof the bounding rect (do not intersect)
388      *
389      * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
390      *
391      * Case 2(a): find the rectangle which contains start and endpoint and having some
392      * TextEntity
393      *
394      * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
395      * TextEntity within the rect made by startPoint and endPoint
396      *
397      * Case 3: Now, we may have two type of selection.
398      * 1. startpoint is left-top of start_end and endpoint is right-bottom
399      * 2. startpoint is left-bottom of start_end and endpoint is top-right
400      *
401      * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
402      * text within them. so, we need to search for the best suitable textposition for start and end.
403      *
404      * Case 3(a): We search the nearest rectangle consisting of some
405      * TinyTextEntity right to or bottom of the startPoint for selection 01.
406      * And, for selection 02, we have to search for right and top
407      *
408      * Case 3(b): For endpoint, we have to find the point top of or left to
409      * endpoint if we have selection 01.
410      * Otherwise, the search will be left and bottom
411      */
412 
413     // we know that startC.x > endC.x, we need to decide which is top and which is bottom
414     const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
415 
416     // Case 1(a)
417     if (!boundingRect.intersects(start_end))
418         return ret;
419 
420     // case 1(b)
421     /**
422         note that, after swapping of start and end, we know that,
423         start is always left to end. but, we cannot say start is
424         positioned upper than end.
425     **/
426     else {
427         // if start is left to content rect take it to content rect boundary
428         if (startC.x * scaleX < minX)
429             startC.x = minX / scaleX;
430         if (endC.x * scaleX > maxX)
431             endC.x = maxX / scaleX;
432 
433         // if start is top to end (selection type 01)
434         if (startC.y * scaleY < minY)
435             startC.y = minY / scaleY;
436         if (endC.y * scaleY > maxY)
437             endC.y = maxY / scaleY;
438 
439         // if start is bottom to end (selection type 02)
440         if (startC.y * scaleY > maxY)
441             startC.y = maxY / scaleY;
442         if (endC.y * scaleY < minY)
443             endC.y = minY / scaleY;
444     }
445 
446     TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
447     TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
448     const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
449 
450     NormalizedRect tmp;
451     // case 2(a)
452     for (; it != itEnd; ++it) {
453         tmp = (*it)->area;
454         if (tmp.contains(startC.x, startC.y)) {
455             start = it;
456         }
457         if (tmp.contains(endC.x, endC.y)) {
458             end = it;
459         }
460     }
461 
462     // case 2(b)
463     it = tmpIt;
464     if (start == it && end == itEnd) {
465         for (; it != itEnd; ++it) {
466             // is there any text rectangle within the start_end rect
467             tmp = (*it)->area;
468             if (start_end.intersects(tmp))
469                 break;
470         }
471 
472         // we have searched every text entities, but none is within the rectangle created by start and end
473         // so, no selection should be done
474         if (it == itEnd) {
475             return ret;
476         }
477     }
478     it = tmpIt;
479     bool selection_two_start = false;
480 
481     // case 3.a
482     if (start == it) {
483         bool flagV = false;
484         NormalizedRect rect;
485 
486         // selection type 01
487         if (startC.y <= endC.y) {
488             for (; it != itEnd; ++it) {
489                 rect = (*it)->area;
490                 rect.isBottom(startC) ? flagV = false : flagV = true;
491 
492                 if (flagV && rect.isRight(startC)) {
493                     start = it;
494                     break;
495                 }
496             }
497         }
498 
499         // selection type 02
500         else {
501             selection_two_start = true;
502             int distance = scaleX + scaleY + 100;
503             int count = 0;
504 
505             for (; it != itEnd; ++it) {
506                 rect = (*it)->area;
507 
508                 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
509                     count++;
510                     QRect entRect = rect.geometry(scaleX, scaleY);
511                     int xdist, ydist;
512                     xdist = entRect.center().x() - startC.x * scaleX;
513                     ydist = entRect.center().y() - startC.y * scaleY;
514 
515                     // make them positive
516                     if (xdist < 0)
517                         xdist = -xdist;
518                     if (ydist < 0)
519                         ydist = -ydist;
520 
521                     if ((xdist + ydist) < distance) {
522                         distance = xdist + ydist;
523                         start = it;
524                     }
525                 }
526             }
527         }
528     }
529 
530     // case 3.b
531     if (end == itEnd) {
532         it = tmpIt;
533         itEnd = itEnd - 1;
534 
535         bool flagV = false;
536         NormalizedRect rect;
537 
538         if (startC.y <= endC.y) {
539             for (; itEnd >= it; itEnd--) {
540                 rect = (*itEnd)->area;
541                 rect.isTop(endC) ? flagV = false : flagV = true;
542 
543                 if (flagV && rect.isLeft(endC)) {
544                     end = itEnd;
545                     break;
546                 }
547             }
548         }
549 
550         else {
551             int distance = scaleX + scaleY + 100;
552             for (; itEnd >= it; itEnd--) {
553                 rect = (*itEnd)->area;
554 
555                 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
556                     QRect entRect = rect.geometry(scaleX, scaleY);
557                     int xdist, ydist;
558                     xdist = entRect.center().x() - endC.x * scaleX;
559                     ydist = entRect.center().y() - endC.y * scaleY;
560 
561                     // make them positive
562                     if (xdist < 0)
563                         xdist = -xdist;
564                     if (ydist < 0)
565                         ydist = -ydist;
566 
567                     if ((xdist + ydist) < distance) {
568                         distance = xdist + ydist;
569                         end = itEnd;
570                     }
571                 }
572             }
573         }
574     }
575 
576     /* if start and end in selection 02 are in the same column, and we
577      start at an empty space we have to remove the selection of last
578      character
579     */
580     if (selection_two_start) {
581         if (start > end) {
582             start = start - 1;
583         }
584     }
585 
586     // if start is less than end swap them
587     if (start > end) {
588         it = start;
589         start = end;
590         end = it;
591     }
592 
593     // removes the possibility of crash, in case none of 1 to 3 is true
594     if (end == d->m_words.constEnd())
595         end--;
596 
597     for (; start <= end; start++) {
598         ret->appendShape((*start)->transformedArea(matrix), side);
599     }
600 
601     return ret;
602 }
603 
findText(int searchID,const QString & query,SearchDirection direct,Qt::CaseSensitivity caseSensitivity,const RegularAreaRect * area)604 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
605 {
606     SearchDirection dir = direct;
607     // invalid search request
608     if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull()))
609         return nullptr;
610     TextList::ConstIterator start;
611     int start_offset = 0;
612     TextList::ConstIterator end;
613     const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
614     if (sIt == d->m_searchPoints.constEnd()) {
615         // if no previous run of this search is found, then set it to start
616         // from the beginning (respecting the search direction)
617         if (dir == NextResult)
618             dir = FromTop;
619         else if (dir == PreviousResult)
620             dir = FromBottom;
621     }
622     bool forward = true;
623     switch (dir) {
624     case FromTop:
625         start = d->m_words.constBegin();
626         start_offset = 0;
627         end = d->m_words.constEnd();
628         break;
629     case FromBottom:
630         start = d->m_words.constEnd();
631         start_offset = 0;
632         end = d->m_words.constBegin();
633         forward = false;
634         break;
635     case NextResult:
636         start = (*sIt)->it_end;
637         start_offset = (*sIt)->offset_end;
638         end = d->m_words.constEnd();
639         break;
640     case PreviousResult:
641         start = (*sIt)->it_begin;
642         start_offset = (*sIt)->offset_begin;
643         end = d->m_words.constBegin();
644         forward = false;
645         break;
646     };
647     RegularAreaRect *ret = nullptr;
648     const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
649     if (forward) {
650         ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
651     } else {
652         ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
653     }
654     return ret;
655 }
656 
657 // hyphenated '-' must be at the end of a word, so hyphenation means
658 // we have a '-' just followed by a '\n' character
659 // check if the string contains a '-' character
660 // if the '-' is the last entry
stringLengthAdaptedWithHyphen(const QString & str,const TextList::ConstIterator & it,const TextList::ConstIterator & textListEnd)661 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
662 {
663     const int len = str.length();
664 
665     // hyphenated '-' must be at the end of a word, so hyphenation means
666     // we have a '-' just followed by a '\n' character
667     // check if the string contains a '-' character
668     // if the '-' is the last entry
669     if (str.endsWith(QLatin1Char('-'))) {
670         // validity chek of it + 1
671         if ((it + 1) != textListEnd) {
672             // 1. if the next character is '\n'
673             const QString &lookahedStr = (*(it + 1))->text();
674             if (lookahedStr.startsWith(QLatin1Char('\n'))) {
675                 return len - 1;
676             }
677 
678             // 2. if the next word is in a different line or not
679             const NormalizedRect &hyphenArea = (*it)->area;
680             const NormalizedRect &lookaheadArea = (*(it + 1))->area;
681 
682             // lookahead to check whether both the '-' rect and next character rect overlap
683             if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
684                 return len - 1;
685             }
686         }
687     }
688     // else if it is the second last entry - for example in pdf format
689     else if (str.endsWith(QLatin1String("-\n"))) {
690         return len - 2;
691     }
692 
693     return len;
694 }
695 
searchPointToArea(const SearchPoint * sp)696 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
697 {
698     PagePrivate *pagePrivate = PagePrivate::get(m_page);
699     const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
700     RegularAreaRect *ret = new RegularAreaRect;
701 
702     for (TextList::ConstIterator it = sp->it_begin;; it++) {
703         const TinyTextEntity *curEntity = *it;
704         ret->append(curEntity->transformedArea(matrix));
705 
706         if (it == sp->it_end) {
707             break;
708         }
709     }
710 
711     ret->simplify();
712     return ret;
713 }
714 
findTextInternalForward(int searchID,const QString & _query,TextComparisonFunction comparer,const TextList::ConstIterator & start,int start_offset,const TextList::ConstIterator & end)715 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
716 {
717     // normalize query search all unicode (including glyphs)
718     const QString query = _query.normalized(QString::NormalizationForm_KC);
719 
720     // j is the current position in our query
721     // queryLeft is the length of the query we have left to match
722     int j = 0, queryLeft = query.length();
723 
724     TextList::ConstIterator it = start;
725     int offset = start_offset;
726 
727     TextList::ConstIterator it_begin = TextList::ConstIterator();
728     int offset_begin = 0; // dummy initial value to suppress compiler warnings
729 
730     while (it != end) {
731         const TinyTextEntity *curEntity = *it;
732         const QString &str = curEntity->text();
733         const int strLen = str.length();
734         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
735         // adjustedLen <= strLen
736 
737         if (offset >= strLen) {
738             it++;
739             offset = 0;
740             continue;
741         }
742 
743         if (it_begin == TextList::ConstIterator()) {
744             it_begin = it;
745             offset_begin = offset;
746         }
747 
748         // Let the user write the hyphen or not when searching for text
749         int matchedLen = -1;
750         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
751             // we have equal (or less than) area of the query left as the length of the current
752             // entity
753             const int min = qMin(queryLeft, matchingLen - offset);
754             if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
755                 matchedLen = min;
756                 break;
757             }
758         }
759 
760         if (matchedLen == -1) {
761             // we have not matched
762             // this means we do not have a complete match
763             // we need to get back to query start
764             // and continue the search from this place
765 #ifdef DEBUG_TEXTPAGE
766             qCDebug(OkularCoreDebug) << "\tnot matched";
767 #endif
768             j = 0;
769             queryLeft = query.length();
770             it = it_begin;
771             offset = offset_begin + 1;
772             it_begin = TextList::ConstIterator();
773         } else {
774             // we have a match
775             // move the current position in the query
776             // to the position after the length of this string
777             // we matched
778             // subtract the length of the current entity from
779             // the left length of the query
780 #ifdef DEBUG_TEXTPAGE
781             qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
782 #endif
783             j += matchedLen;
784             queryLeft -= matchedLen;
785 
786             if (queryLeft == 0) {
787                 // save or update the search point for the current searchID
788                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
789                 if (sIt == m_searchPoints.end()) {
790                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
791                 }
792                 SearchPoint *sp = *sIt;
793                 sp->it_begin = it_begin;
794                 sp->it_end = it;
795                 sp->offset_begin = offset_begin;
796                 sp->offset_end = offset + matchedLen;
797                 return searchPointToArea(sp);
798             }
799 
800             it++;
801             offset = 0;
802         }
803     }
804     // end of loop - it means that we've ended the textentities
805 
806     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
807     if (sIt != m_searchPoints.end()) {
808         SearchPoint *sp = *sIt;
809         m_searchPoints.erase(sIt);
810         delete sp;
811     }
812     return nullptr;
813 }
814 
findTextInternalBackward(int searchID,const QString & _query,TextComparisonFunction comparer,const TextList::ConstIterator & start,int start_offset,const TextList::ConstIterator & end)815 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
816 {
817     // normalize query to search all unicode (including glyphs)
818     const QString query = _query.normalized(QString::NormalizationForm_KC);
819 
820     // j is the current position in our query
821     // len is the length of the string in TextEntity
822     // queryLeft is the length of the query we have left
823     int j = query.length(), queryLeft = query.length();
824 
825     TextList::ConstIterator it = start;
826     int offset = start_offset;
827 
828     TextList::ConstIterator it_begin = TextList::ConstIterator();
829     int offset_begin = 0; // dummy initial value to suppress compiler warnings
830 
831     while (true) {
832         if (offset <= 0) {
833             if (it == end) {
834                 break;
835             }
836             it--;
837         }
838 
839         const TinyTextEntity *curEntity = *it;
840         const QString &str = curEntity->text();
841         const int strLen = str.length();
842         const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
843         // adjustedLen <= strLen
844 
845         if (offset <= 0) {
846             offset = strLen;
847         }
848 
849         if (it_begin == TextList::ConstIterator()) {
850             it_begin = it;
851             offset_begin = offset;
852         }
853 
854         // Let the user write the hyphen or not when searching for text
855         int matchedLen = -1;
856         // we have equal (or less than) area of the query left as the length of the current
857         // entity
858         for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
859             const int hyphenOffset = (strLen - matchingLen);
860             const int min = qMin(queryLeft + hyphenOffset, offset);
861             if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
862                 matchedLen = min - hyphenOffset;
863                 break;
864             }
865         }
866 
867         if (matchedLen == -1) {
868             // we have not matched
869             // this means we do not have a complete match
870             // we need to get back to query start
871             // and continue the search from this place
872 #ifdef DEBUG_TEXTPAGE
873             qCDebug(OkularCoreDebug) << "\tnot matched";
874 #endif
875 
876             j = query.length();
877             queryLeft = query.length();
878             it = it_begin;
879             offset = offset_begin - 1;
880             it_begin = TextList::ConstIterator();
881         } else {
882             // we have a match
883             // move the current position in the query
884             // to the position after the length of this string
885             // we matched
886             // subtract the length of the current entity from
887             // the left length of the query
888 #ifdef DEBUG_TEXTPAGE
889             qCDebug(OkularCoreDebug) << "\tmatched";
890 #endif
891             j -= matchedLen;
892             queryLeft -= matchedLen;
893 
894             if (queryLeft == 0) {
895                 // save or update the search point for the current searchID
896                 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
897                 if (sIt == m_searchPoints.end()) {
898                     sIt = m_searchPoints.insert(searchID, new SearchPoint);
899                 }
900                 SearchPoint *sp = *sIt;
901                 sp->it_begin = it;
902                 sp->it_end = it_begin;
903                 sp->offset_begin = offset - matchedLen;
904                 sp->offset_end = offset_begin;
905                 return searchPointToArea(sp);
906             }
907 
908             offset = 0;
909         }
910     }
911     // end of loop - it means that we've ended the textentities
912 
913     const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
914     if (sIt != m_searchPoints.end()) {
915         SearchPoint *sp = *sIt;
916         m_searchPoints.erase(sIt);
917         delete sp;
918     }
919     return nullptr;
920 }
921 
text(const RegularAreaRect * area) const922 QString TextPage::text(const RegularAreaRect *area) const
923 {
924     return text(area, AnyPixelTextAreaInclusionBehaviour);
925 }
926 
text(const RegularAreaRect * area,TextAreaInclusionBehaviour b) const927 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
928 {
929     if (area && area->isNull())
930         return QString();
931 
932     TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
933     QString ret;
934     if (area) {
935         for (; it != itEnd; ++it) {
936             if (b == AnyPixelTextAreaInclusionBehaviour) {
937                 if (area->intersects((*it)->area)) {
938                     ret += (*it)->text();
939                 }
940             } else {
941                 NormalizedPoint center = (*it)->area.center();
942                 if (area->contains(center.x, center.y)) {
943                     ret += (*it)->text();
944                 }
945             }
946         }
947     } else {
948         for (; it != itEnd; ++it)
949             ret += (*it)->text();
950     }
951     return ret;
952 }
953 
compareTinyTextEntityX(const WordWithCharacters & first,const WordWithCharacters & second)954 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
955 {
956     QRect firstArea = first.area().roundedGeometry(1000, 1000);
957     QRect secondArea = second.area().roundedGeometry(1000, 1000);
958 
959     return firstArea.left() < secondArea.left();
960 }
961 
compareTinyTextEntityY(const WordWithCharacters & first,const WordWithCharacters & second)962 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
963 {
964     const QRect firstArea = first.area().roundedGeometry(1000, 1000);
965     const QRect secondArea = second.area().roundedGeometry(1000, 1000);
966 
967     return firstArea.top() < secondArea.top();
968 }
969 
970 /**
971  * Sets a new world list. Deleting the contents of the old one
972  */
setWordList(const TextList & list)973 void TextPagePrivate::setWordList(const TextList &list)
974 {
975     qDeleteAll(m_words);
976     m_words = list;
977 }
978 
979 /**
980  * Remove all the spaces in between texts. It will make all the generators
981  * same, whether they save spaces(like pdf) or not(like djvu).
982  */
removeSpace(TextList * words)983 static void removeSpace(TextList *words)
984 {
985     TextList::Iterator it = words->begin();
986     const QString str(QLatin1Char(' '));
987 
988     while (it != words->end()) {
989         if ((*it)->text() == str) {
990             it = words->erase(it);
991         } else {
992             ++it;
993         }
994     }
995 }
996 
997 /**
998  * We will read the TinyTextEntity from characters and try to create words from there.
999  * Note: characters might be already characters for some generators, but we will keep
1000  * the nomenclature characters for the generator produced data. The resulting
1001  * WordsWithCharacters memory has to be managed by the caller, both the
1002  * WordWithCharacters::word and WordWithCharacters::characters contents
1003  */
makeWordFromCharacters(const TextList & characters,int pageWidth,int pageHeight)1004 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1005 {
1006     /**
1007      * We will traverse characters and try to create words from the TinyTextEntities in it.
1008      * We will search TinyTextEntity blocks and merge them until we get a
1009      * space between two consecutive TinyTextEntities. When we get a space
1010      * we can take it as a end of word. Then we store the word as a TinyTextEntity
1011      * and keep it in newList.
1012 
1013      * We create a RegionText named regionWord that contains the word and the characters associated with it and
1014      * a rectangle area of the element in newList.
1015 
1016      */
1017     WordsWithCharacters wordsWithCharacters;
1018 
1019     TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1020     int newLeft, newRight, newTop, newBottom;
1021     int index = 0;
1022 
1023     for (; it != itEnd; it++) {
1024         QString textString = (*it)->text();
1025         QString newString;
1026         QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1027         TextList wordCharacters;
1028         tmpIt = it;
1029         int space = 0;
1030 
1031         while (!space) {
1032             if (!textString.isEmpty()) {
1033                 newString.append(textString);
1034 
1035                 // when textString is the start of the word
1036                 if (tmpIt == it) {
1037                     NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1038                     wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1039                 } else {
1040                     NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1041                     wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1042                 }
1043             }
1044 
1045             ++it;
1046 
1047             /*
1048              we must have to put this line before the if condition of it==itEnd
1049              otherwise the last character can be missed
1050              */
1051             if (it == itEnd)
1052                 break;
1053             elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1054             if (!doesConsumeY(elementArea, lineArea, 60)) {
1055                 --it;
1056                 break;
1057             }
1058 
1059             const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1060             const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1061 
1062             space = elementArea.left() - lineArea.right();
1063 
1064             if (space != 0) {
1065                 it--;
1066                 break;
1067             }
1068 
1069             newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1070             newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1071             newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1072             newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1073 
1074             lineArea.setLeft(newLeft);
1075             lineArea.setTop(newTop);
1076             lineArea.setWidth(newRight - newLeft);
1077             lineArea.setHeight(newBottom - newTop);
1078 
1079             textString = (*it)->text();
1080         }
1081 
1082         // if newString is not empty, save it
1083         if (!newString.isEmpty()) {
1084             const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1085             TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1086             wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1087 
1088             index++;
1089         }
1090 
1091         if (it == itEnd)
1092             break;
1093     }
1094 
1095     return wordsWithCharacters;
1096 }
1097 
1098 /**
1099  * Create Lines from the words and sort them
1100  */
makeAndSortLines(const WordsWithCharacters & wordsTmp,int pageWidth,int pageHeight)1101 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1102 {
1103     /**
1104      * We cannot assume that the generator will give us texts in the right order.
1105      * We can only assume that we will get texts in the page and their bounding
1106      * rectangle. The texts can be character, word, half-word anything.
1107      * So, we need to:
1108      **
1109      * 1. Sort rectangles/boxes containing texts by y0(top)
1110      * 2. Create textline where there is y overlap between TinyTextEntity 's
1111      * 3. Within each line sort the TinyTextEntity 's by x0(left)
1112      */
1113 
1114     QList<QPair<WordsWithCharacters, QRect>> lines;
1115 
1116     /*
1117      Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1118      not contain same pointers for all the TinyTextEntity.
1119      */
1120     QList<WordWithCharacters> words = wordsTmp;
1121 
1122     // Step 1
1123     std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1124 
1125     // Step 2
1126     QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1127 
1128     // for every non-space texts(characters/words) in the textList
1129     for (; it != itEnd; it++) {
1130         const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1131         bool found = false;
1132 
1133         for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1134             /* the line area which will be expanded
1135                line_rects is only necessary to preserve the topmin and bottommax of all
1136                the texts in the line, left and right is not necessary at all
1137             */
1138             QRect &lineArea = linesI.second;
1139             const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1140             const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1141 
1142             /*
1143                if the new text and the line has y overlapping parts of more than 70%,
1144                the text will be added to this line
1145              */
1146             if (doesConsumeY(elementArea, lineArea, 70)) {
1147                 WordsWithCharacters &line = linesI.first;
1148                 line.append(*it);
1149 
1150                 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1151                 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1152                 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1153                 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1154 
1155                 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1156                 found = true;
1157             }
1158 
1159             if (found)
1160                 break;
1161         }
1162 
1163         /* when we have found a new line create a new TextList containing
1164            only one element and append it to the lines
1165          */
1166         if (!found) {
1167             WordsWithCharacters tmp;
1168             tmp.append((*it));
1169             lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1170         }
1171     }
1172 
1173     // Step 3
1174     for (QPair<WordsWithCharacters, QRect> &line : lines) {
1175         WordsWithCharacters &list = line.first;
1176         std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1177     }
1178 
1179     return lines;
1180 }
1181 
1182 /**
1183  * Calculate Statistical information from the lines we made previously
1184  */
calculateStatisticalInformation(const QList<WordWithCharacters> & words,int pageWidth,int pageHeight,int * word_spacing,int * line_spacing,int * col_spacing)1185 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1186 {
1187     /**
1188      * For the region, defined by line_rects and lines
1189      * 1. Make line statistical analysis to find the line spacing
1190      * 2. Make character statistical analysis to differentiate between
1191      *   word spacing and column spacing.
1192      */
1193 
1194     /**
1195      * Step 0
1196      */
1197     const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1198 
1199     /**
1200      * Step 1
1201      */
1202     QMap<int, int> line_space_stat;
1203     for (int i = 0; i < sortedLines.length(); i++) {
1204         const QRect rectUpper = sortedLines.at(i).second;
1205 
1206         if (i + 1 == sortedLines.length())
1207             break;
1208         const QRect rectLower = sortedLines.at(i + 1).second;
1209 
1210         int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1211         if (linespace < 0)
1212             linespace = -linespace;
1213 
1214         if (line_space_stat.contains(linespace))
1215             line_space_stat[linespace]++;
1216         else
1217             line_space_stat[linespace] = 1;
1218     }
1219 
1220     *line_spacing = 0;
1221     int weighted_count = 0;
1222     QMapIterator<int, int> iterate_linespace(line_space_stat);
1223 
1224     while (iterate_linespace.hasNext()) {
1225         iterate_linespace.next();
1226         *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1227         weighted_count += iterate_linespace.value();
1228     }
1229     if (*line_spacing != 0)
1230         *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1231 
1232     /**
1233      * Step 2
1234      */
1235     // We would like to use QMap instead of QHash as it will keep the keys sorted
1236     QMap<int, int> hor_space_stat;
1237     QMap<int, int> col_space_stat;
1238     QList<QList<QRect>> space_rects;
1239     QVector<QRect> max_hor_space_rects;
1240 
1241     // Space in every line
1242     for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1243         const WordsWithCharacters list = sortedLine.first;
1244         QList<QRect> line_space_rects;
1245         int maxSpace = 0, minSpace = pageWidth;
1246 
1247         // for every TinyTextEntity element in the line
1248         WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1249         QRect max_area1, max_area2;
1250         QString before_max, after_max;
1251 
1252         // for every line
1253         for (; it != itEnd; it++) {
1254             const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1255             if (it + 1 == itEnd)
1256                 break;
1257 
1258             const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1259             int space = area2.left() - area1.right();
1260 
1261             if (space > maxSpace) {
1262                 max_area1 = area1;
1263                 max_area2 = area2;
1264                 maxSpace = space;
1265                 before_max = (*it).text();
1266                 after_max = (*(it + 1)).text();
1267             }
1268 
1269             if (space < minSpace && space != 0)
1270                 minSpace = space;
1271 
1272             // if we found a real space, whose length is not zero and also less than the pageWidth
1273             if (space != 0 && space != pageWidth) {
1274                 // increase the count of the space amount
1275                 if (hor_space_stat.contains(space))
1276                     hor_space_stat[space]++;
1277                 else
1278                     hor_space_stat[space] = 1;
1279 
1280                 int left, right, top, bottom;
1281 
1282                 left = area1.right();
1283                 right = area2.left();
1284 
1285                 top = area2.top() < area1.top() ? area2.top() : area1.top();
1286                 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1287 
1288                 QRect rect(left, top, right - left, bottom - top);
1289                 line_space_rects.append(rect);
1290             }
1291         }
1292 
1293         space_rects.append(line_space_rects);
1294 
1295         if (hor_space_stat.contains(maxSpace)) {
1296             if (hor_space_stat[maxSpace] != 1)
1297                 hor_space_stat[maxSpace]--;
1298             else
1299                 hor_space_stat.remove(maxSpace);
1300         }
1301 
1302         if (maxSpace != 0) {
1303             if (col_space_stat.contains(maxSpace))
1304                 col_space_stat[maxSpace]++;
1305             else
1306                 col_space_stat[maxSpace] = 1;
1307 
1308             // store the max rect of each line
1309             const int left = max_area1.right();
1310             const int right = max_area2.left();
1311             const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1312             const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1313 
1314             const QRect rect(left, top, right - left, bottom - top);
1315             max_hor_space_rects.append(rect);
1316         } else
1317             max_hor_space_rects.append(QRect(0, 0, 0, 0));
1318     }
1319 
1320     // All the between word space counts are in hor_space_stat
1321     *word_spacing = 0;
1322     weighted_count = 0;
1323     QMapIterator<int, int> iterate(hor_space_stat);
1324 
1325     while (iterate.hasNext()) {
1326         iterate.next();
1327 
1328         if (iterate.key() > 0) {
1329             *word_spacing += iterate.value() * iterate.key();
1330             weighted_count += iterate.value();
1331         }
1332     }
1333     if (weighted_count)
1334         *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1335 
1336     *col_spacing = 0;
1337     QMapIterator<int, int> iterate_col(col_space_stat);
1338 
1339     while (iterate_col.hasNext()) {
1340         iterate_col.next();
1341         if (iterate_col.value() > *col_spacing)
1342             *col_spacing = iterate_col.value();
1343     }
1344     *col_spacing = col_space_stat.key(*col_spacing);
1345 
1346     // if there is just one line in a region, there is no point in dividing it
1347     if (sortedLines.length() == 1)
1348         *word_spacing = *col_spacing;
1349 }
1350 
1351 /**
1352  * Implements the XY Cut algorithm for textpage segmentation
1353  * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1354  * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1355  */
XYCutForBoundingBoxes(const QList<WordWithCharacters> & wordsWithCharacters,int pageWidth,int pageHeight)1356 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1357 {
1358     RegionTextList tree;
1359     QRect contentRect(0, 0, pageWidth, pageHeight);
1360     const RegionText root(wordsWithCharacters, contentRect);
1361 
1362     // start the tree with the root, it is our only region at the start
1363     tree.push_back(root);
1364 
1365     int i = 0;
1366 
1367     // while traversing the tree has not been ended
1368     while (i < tree.length()) {
1369         const RegionText node = tree.at(i);
1370         QRect regionRect = node.area();
1371 
1372         /**
1373          * 1. calculation of projection profiles
1374          */
1375         // allocate the size of proj profiles and initialize with 0
1376         int size_proj_y = node.area().height();
1377         int size_proj_x = node.area().width();
1378         // dynamic memory allocation
1379         QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1380         QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1381 
1382         for (int j = 0; j < size_proj_y; ++j)
1383             proj_on_yaxis[j] = 0;
1384         for (int j = 0; j < size_proj_x; ++j)
1385             proj_on_xaxis[j] = 0;
1386 
1387         const QList<WordWithCharacters> list = node.text();
1388 
1389         // Calculate tcx and tcy locally for each new region
1390         int word_spacing, line_spacing, column_spacing;
1391         calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1392 
1393         const int tcx = word_spacing * 2;
1394         const int tcy = line_spacing * 2;
1395 
1396         int maxX = 0, maxY = 0;
1397         int avgX = 0;
1398         int count;
1399 
1400         // for every text in the region
1401         for (const WordWithCharacters &wwc : list) {
1402             TinyTextEntity *ent = wwc.word;
1403             const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1404 
1405             // calculate vertical projection profile proj_on_xaxis1
1406             for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1407                 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0)
1408                     proj_on_xaxis[k - regionRect.left()] += entRect.height();
1409             }
1410 
1411             // calculate horizontal projection profile in the same way
1412             for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1413                 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0)
1414                     proj_on_yaxis[k - regionRect.top()] += entRect.width();
1415             }
1416         }
1417 
1418         for (int j = 0; j < size_proj_y; ++j) {
1419             if (proj_on_yaxis[j] > maxY)
1420                 maxY = proj_on_yaxis[j];
1421         }
1422 
1423         avgX = count = 0;
1424         for (int j = 0; j < size_proj_x; ++j) {
1425             if (proj_on_xaxis[j] > maxX)
1426                 maxX = proj_on_xaxis[j];
1427             if (proj_on_xaxis[j]) {
1428                 count++;
1429                 avgX += proj_on_xaxis[j];
1430             }
1431         }
1432         if (count)
1433             avgX /= count;
1434 
1435         /**
1436          * 2. Cleanup Boundary White Spaces and removal of noise
1437          */
1438         int xbegin = 0, xend = size_proj_x - 1;
1439         int ybegin = 0, yend = size_proj_y - 1;
1440         while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0)
1441             xbegin++;
1442         while (xend >= 0 && proj_on_xaxis[xend] <= 0)
1443             xend--;
1444         while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0)
1445             ybegin++;
1446         while (yend >= 0 && proj_on_yaxis[yend] <= 0)
1447             yend--;
1448 
1449         // update the regionRect
1450         int old_left = regionRect.left(), old_top = regionRect.top();
1451         regionRect.setLeft(old_left + xbegin);
1452         regionRect.setRight(old_left + xend);
1453         regionRect.setTop(old_top + ybegin);
1454         regionRect.setBottom(old_top + yend);
1455 
1456         int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1457         for (int j = 0; j < size_proj_x; ++j)
1458             proj_on_xaxis[j] -= tnx;
1459         for (int j = 0; j < size_proj_y; ++j)
1460             proj_on_yaxis[j] -= tny;
1461 
1462         /**
1463          * 3. Find the Widest gap
1464          */
1465         int gap_hor = -1, pos_hor = -1;
1466         int begin = -1, end = -1;
1467 
1468         // find all hor_gaps and find the maximum between them
1469         for (int j = 1; j < size_proj_y; ++j) {
1470             // transition from white to black
1471             if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0)
1472                 end = j;
1473 
1474             // transition from black to white
1475             if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0)
1476                 begin = j;
1477 
1478             if (begin > 0 && end > 0 && end - begin > gap_hor) {
1479                 gap_hor = end - begin;
1480                 pos_hor = (end + begin) / 2;
1481                 begin = -1;
1482                 end = -1;
1483             }
1484         }
1485 
1486         begin = -1, end = -1;
1487         int gap_ver = -1, pos_ver = -1;
1488 
1489         // find all the ver_gaps and find the maximum between them
1490         for (int j = 1; j < size_proj_x; ++j) {
1491             // transition from white to black
1492             if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1493                 end = j;
1494             }
1495 
1496             // transition from black to white
1497             if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0)
1498                 begin = j;
1499 
1500             if (begin > 0 && end > 0 && end - begin > gap_ver) {
1501                 gap_ver = end - begin;
1502                 pos_ver = (end + begin) / 2;
1503                 begin = -1;
1504                 end = -1;
1505             }
1506         }
1507 
1508         int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1509         int gap_x = gap_ver, gap_y = gap_hor;
1510 
1511         /**
1512          * 4. Cut the region and make nodes (left,right) or (up,down)
1513          */
1514         bool cut_hor = false, cut_ver = false;
1515 
1516         // For horizontal cut
1517         const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1518         const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1519         const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1520 
1521         // For vertical Cut
1522         const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1523         const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1524         const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1525 
1526         if (gap_y >= gap_x && gap_y >= tcy)
1527             cut_hor = true;
1528         else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx)
1529             cut_ver = true;
1530         else if (gap_x >= gap_y && gap_x >= tcx)
1531             cut_ver = true;
1532         else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy)
1533             cut_hor = true;
1534         // no cut possible
1535         else {
1536             // we can now update the node rectangle with the shrinked rectangle
1537             RegionText tmpNode = tree.at(i);
1538             tmpNode.setArea(regionRect);
1539             tree.replace(i, tmpNode);
1540             i++;
1541             continue;
1542         }
1543 
1544         WordsWithCharacters list1, list2;
1545 
1546         // horizontal cut, topRect and bottomRect
1547         if (cut_hor) {
1548             for (const WordWithCharacters &word : list) {
1549                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1550 
1551                 if (topRect.intersects(wordRect))
1552                     list1.append(word);
1553                 else
1554                     list2.append(word);
1555             }
1556 
1557             RegionText node1(list1, topRect);
1558             RegionText node2(list2, bottomRect);
1559 
1560             tree.replace(i, node1);
1561             tree.insert(i + 1, node2);
1562         }
1563 
1564         // vertical cut, leftRect and rightRect
1565         else if (cut_ver) {
1566             for (const WordWithCharacters &word : list) {
1567                 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1568 
1569                 if (leftRect.intersects(wordRect))
1570                     list1.append(word);
1571                 else
1572                     list2.append(word);
1573             }
1574 
1575             RegionText node1(list1, leftRect);
1576             RegionText node2(list2, rightRect);
1577 
1578             tree.replace(i, node1);
1579             tree.insert(i + 1, node2);
1580         }
1581     }
1582 
1583     return tree;
1584 }
1585 
1586 /**
1587  * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1588  */
addNecessarySpace(RegionTextList tree,int pageWidth,int pageHeight)1589 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1590 {
1591     /**
1592      * 1. Call makeAndSortLines before adding spaces in between words in a line
1593      * 2. Now add spaces between every two words in a line
1594      * 3. Finally, extract all the space separated texts from each region and return it
1595      */
1596 
1597     // Only change the texts under RegionTexts, not the area
1598     for (RegionText &tmpRegion : tree) {
1599         // Step 01
1600         QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1601 
1602         // Step 02
1603         for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1604             WordsWithCharacters &list = sortedLine.first;
1605             for (int k = 0; k < list.length(); k++) {
1606                 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1607                 if (k + 1 >= list.length())
1608                     break;
1609 
1610                 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1611                 const int space = area2.left() - area1.right();
1612 
1613                 if (space != 0) {
1614                     // Make a TinyTextEntity of string space and push it between it and it+1
1615                     const int left = area1.right();
1616                     const int right = area2.left();
1617                     const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1618                     const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1619 
1620                     const QString spaceStr(QStringLiteral(" "));
1621                     const QRect rect(QPoint(left, top), QPoint(right, bottom));
1622                     const NormalizedRect entRect(rect, pageWidth, pageHeight);
1623                     TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1624                     TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1625                     WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1626 
1627                     list.insert(k + 1, word);
1628 
1629                     // Skip the space
1630                     k++;
1631                 }
1632             }
1633         }
1634 
1635         WordsWithCharacters tmpList;
1636         for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1637             tmpList += sortedLine.first;
1638         }
1639         tmpRegion.setText(tmpList);
1640     }
1641 
1642     // Step 03
1643     WordsWithCharacters tmp;
1644     for (const RegionText &tmpRegion : qAsConst(tree)) {
1645         tmp += tmpRegion.text();
1646     }
1647     return tmp;
1648 }
1649 
1650 /**
1651  * Correct the textOrder, all layout recognition works here
1652  */
correctTextOrder()1653 void TextPagePrivate::correctTextOrder()
1654 {
1655     // m_page->width() and m_page->height() are in pixels at
1656     // 100% zoom level, and thus depend on display DPI.
1657     // To avoid Okular failing on lowDPI displays,
1658     // we scale pageWidth and pageHeight so their sum equals 2000.
1659     const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1660     const int pageWidth = (int)(scalingFactor * m_page->width());
1661     const int pageHeight = (int)(scalingFactor * m_page->height());
1662 
1663     TextList characters = m_words;
1664 
1665     /**
1666      * Remove spaces from the text
1667      */
1668     removeSpace(&characters);
1669 
1670     /**
1671      * Construct words from characters
1672      */
1673     const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1674 
1675     /**
1676      * Make a XY Cut tree for segmentation of the texts
1677      */
1678     const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1679 
1680     /**
1681      * Add spaces to the word
1682      */
1683     const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1684 
1685     /**
1686      * Break the words into characters
1687      */
1688     TextList listOfCharacters;
1689     for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1690         delete word.word;
1691         listOfCharacters.append(word.characters);
1692     }
1693     setWordList(listOfCharacters);
1694 }
1695 
words(const RegularAreaRect * area,TextAreaInclusionBehaviour b) const1696 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
1697 {
1698     if (area && area->isNull())
1699         return TextEntity::List();
1700 
1701     TextEntity::List ret;
1702     if (area) {
1703         for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1704             if (b == AnyPixelTextAreaInclusionBehaviour) {
1705                 if (area->intersects(te->area)) {
1706                     ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1707                 }
1708             } else {
1709                 const NormalizedPoint center = te->area.center();
1710                 if (area->contains(center.x, center.y)) {
1711                     ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1712                 }
1713             }
1714         }
1715     } else {
1716         for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1717             ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1718         }
1719     }
1720     return ret;
1721 }
1722 
wordAt(const NormalizedPoint & p,QString * word) const1723 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const
1724 {
1725     TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1726     TextList::ConstIterator it = itBegin;
1727     TextList::ConstIterator posIt = itEnd;
1728     for (; it != itEnd; ++it) {
1729         if ((*it)->area.contains(p.x, p.y)) {
1730             posIt = it;
1731             break;
1732         }
1733     }
1734     QString text;
1735     if (posIt != itEnd) {
1736         if ((*posIt)->text().simplified().isEmpty()) {
1737             return nullptr;
1738         }
1739         // Find the first TinyTextEntity of the word
1740         while (posIt != itBegin) {
1741             --posIt;
1742             const QString itText = (*posIt)->text();
1743             if (itText.right(1).at(0).isSpace()) {
1744                 if (itText.endsWith(QLatin1String("-\n"))) {
1745                     // Is an hyphenated word
1746                     // continue searching the start of the word back
1747                     continue;
1748                 }
1749 
1750                 if (itText == QLatin1String("\n") && posIt != itBegin) {
1751                     --posIt;
1752                     if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1753                         // Is an hyphenated word
1754                         // continue searching the start of the word back
1755                         continue;
1756                     }
1757                     ++posIt;
1758                 }
1759 
1760                 ++posIt;
1761                 break;
1762             }
1763         }
1764         RegularAreaRect *ret = new RegularAreaRect();
1765         for (; posIt != itEnd; ++posIt) {
1766             const QString itText = (*posIt)->text();
1767             if (itText.simplified().isEmpty()) {
1768                 break;
1769             }
1770 
1771             ret->appendShape((*posIt)->area);
1772             text += (*posIt)->text();
1773             if (itText.right(1).at(0).isSpace()) {
1774                 if (!text.endsWith(QLatin1String("-\n"))) {
1775                     break;
1776                 }
1777             }
1778         }
1779 
1780         if (word) {
1781             *word = text;
1782         }
1783         return ret;
1784     } else {
1785         return nullptr;
1786     }
1787 }
1788