1 /*
2 SPDX-FileCopyrightText: 2005 Piotr Szymanski <niedakh@gmail.com>
3
4 SPDX-License-Identifier: GPL-2.0-or-later
5 */
6
7 #include "textpage.h"
8 #include "textpage_p.h"
9
10 #include <QDebug>
11
12 #include "area.h"
13 #include "debug_p.h"
14 #include "misc.h"
15 #include "page.h"
16 #include "page_p.h"
17
18 #include <cstring>
19
20 #include <QVarLengthArray>
21 #include <QtAlgorithms>
22
23 using namespace Okular;
24
25 class SearchPoint
26 {
27 public:
SearchPoint()28 SearchPoint()
29 : offset_begin(-1)
30 , offset_end(-1)
31 {
32 }
33
34 /** The TinyTextEntity containing the first character of the match. */
35 TextList::ConstIterator it_begin;
36
37 /** The TinyTextEntity containing the last character of the match. */
38 TextList::ConstIterator it_end;
39
40 /** The index of the first character of the match in (*it_begin)->text().
41 * Satisfies 0 <= offset_begin < (*it_begin)->text().length().
42 */
43 int offset_begin;
44
45 /** One plus the index of the last character of the match in (*it_end)->text().
46 * Satisfies 0 < offset_end <= (*it_end)->text().length().
47 */
48 int offset_end;
49 };
50
51 /* text comparison functions */
52
CaseInsensitiveCmpFn(const QStringRef & from,const QStringRef & to)53 static bool CaseInsensitiveCmpFn(const QStringRef &from, const QStringRef &to)
54 {
55 #ifdef DEBUG_TEXTPAGE
56 qDebug(OkularCoreDebug) << from << ":" << to << "(case insensitive)";
57 #endif
58 return from.compare(to, Qt::CaseInsensitive) == 0;
59 }
60
CaseSensitiveCmpFn(const QStringRef & from,const QStringRef & to)61 static bool CaseSensitiveCmpFn(const QStringRef &from, const QStringRef &to)
62 {
63 #ifdef DEBUG_TEXTPAGE
64 qDebug(OkularCoreDebug) << from << ":" << to << "(case sensitive)";
65 #endif
66 return from.compare(to, Qt::CaseSensitive) == 0;
67 }
68
69 /**
70 * Returns true iff segments [@p left1, @p right1] and [@p left2, @p right2] on the real line
71 * overlap within @p threshold percent, i. e. iff the ratio of the length of the
72 * intersection of the segments to the length of the shortest of the two input segments
73 * is not smaller than the threshold.
74 */
segmentsOverlap(double left1,double right1,double left2,double right2,int threshold)75 static bool segmentsOverlap(double left1, double right1, double left2, double right2, int threshold)
76 {
77 // check if one consumes another fully (speed optimization)
78
79 if (left1 <= left2 && right1 >= right2)
80 return true;
81
82 if (left1 >= left2 && right1 <= right2)
83 return true;
84
85 // check if there is overlap above threshold
86 if (right2 >= left1 && right1 >= left2) {
87 double overlap = (right2 >= right1) ? right1 - left2 : right2 - left1;
88
89 double length1 = right1 - left1, length2 = right2 - left2;
90
91 return overlap * 100 >= threshold * qMin(length1, length2);
92 }
93
94 return false;
95 }
96
doesConsumeY(const QRect first,const QRect second,int threshold)97 static bool doesConsumeY(const QRect first, const QRect second, int threshold)
98 {
99 return segmentsOverlap(first.top(), first.bottom(), second.top(), second.bottom(), threshold);
100 }
101
doesConsumeY(const NormalizedRect & first,const NormalizedRect & second,int threshold)102 static bool doesConsumeY(const NormalizedRect &first, const NormalizedRect &second, int threshold)
103 {
104 return segmentsOverlap(first.top, first.bottom, second.top, second.bottom, threshold);
105 }
106
107 /*
108 Rationale behind TinyTextEntity:
109
110 instead of storing directly a QString for the text of an entity,
111 we store the UTF-16 data and their length. This way, we save about
112 4 int's wrt a QString, and we can create a new string from that
113 raw data (that's the only penalty of that).
114 Even better, if the string we need to store has at most
115 MaxStaticChars characters, then we store those in place of the QChar*
116 that would be used (with new[] + free[]) for the data.
117 */
118 class TinyTextEntity
119 {
120 static const int MaxStaticChars = sizeof(void *) / sizeof(QChar);
121
122 public:
TinyTextEntity(const QString & text,const NormalizedRect & rect)123 TinyTextEntity(const QString &text, const NormalizedRect &rect)
124 : area(rect)
125 {
126 Q_ASSERT_X(!text.isEmpty(), "TinyTextEntity", "empty string");
127 Q_ASSERT_X(sizeof(d) == sizeof(void *), "TinyTextEntity", "internal storage is wider than QChar*, fix it!");
128 length = text.length();
129 switch (length) {
130 #if QT_POINTER_SIZE >= 8
131 case 4:
132 d.qc[3] = text.at(3).unicode();
133 // fall through
134 case 3:
135 d.qc[2] = text.at(2).unicode();
136 #endif
137 // fall through
138 case 2:
139 d.qc[1] = text.at(1).unicode();
140 // fall through
141 case 1:
142 d.qc[0] = text.at(0).unicode();
143 break;
144 default:
145 d.data = new QChar[length];
146 std::memcpy(d.data, text.constData(), length * sizeof(QChar));
147 }
148 }
149
~TinyTextEntity()150 ~TinyTextEntity()
151 {
152 if (length > MaxStaticChars) {
153 delete[] d.data;
154 }
155 }
156
text() const157 inline QString text() const
158 {
159 return length <= MaxStaticChars ? QString::fromRawData((const QChar *)&d.qc[0], length) : QString::fromRawData(d.data, length);
160 }
161
transformedArea(const QTransform & matrix) const162 inline NormalizedRect transformedArea(const QTransform &matrix) const
163 {
164 NormalizedRect transformed_area = area;
165 transformed_area.transform(matrix);
166 return transformed_area;
167 }
168
169 NormalizedRect area;
170
171 private:
172 Q_DISABLE_COPY(TinyTextEntity)
173
174 union {
175 QChar *data;
176 ushort qc[MaxStaticChars];
177 } d;
178 int length;
179 };
180
TextEntity(const QString & text,NormalizedRect * area)181 TextEntity::TextEntity(const QString &text, NormalizedRect *area)
182 : m_text(text)
183 , m_area(area)
184 , d(nullptr)
185 {
186 }
187
~TextEntity()188 TextEntity::~TextEntity()
189 {
190 delete m_area;
191 }
192
text() const193 QString TextEntity::text() const
194 {
195 return m_text;
196 }
197
area() const198 NormalizedRect *TextEntity::area() const
199 {
200 return m_area;
201 }
202
transformedArea(const QTransform & matrix) const203 NormalizedRect TextEntity::transformedArea(const QTransform &matrix) const
204 {
205 NormalizedRect transformed_area = *m_area;
206 transformed_area.transform(matrix);
207 return transformed_area;
208 }
209
TextPagePrivate()210 TextPagePrivate::TextPagePrivate()
211 : m_page(nullptr)
212 {
213 }
214
~TextPagePrivate()215 TextPagePrivate::~TextPagePrivate()
216 {
217 qDeleteAll(m_searchPoints);
218 qDeleteAll(m_words);
219 }
220
TextPage()221 TextPage::TextPage()
222 : d(new TextPagePrivate())
223 {
224 }
225
TextPage(const TextEntity::List & words)226 TextPage::TextPage(const TextEntity::List &words)
227 : d(new TextPagePrivate())
228 {
229 TextEntity::List::ConstIterator it = words.constBegin(), itEnd = words.constEnd();
230 for (; it != itEnd; ++it) {
231 TextEntity *e = *it;
232 if (!e->text().isEmpty())
233 d->m_words.append(new TinyTextEntity(e->text(), *e->area()));
234 delete e;
235 }
236 }
237
~TextPage()238 TextPage::~TextPage()
239 {
240 delete d;
241 }
242
append(const QString & text,NormalizedRect * area)243 void TextPage::append(const QString &text, NormalizedRect *area)
244 {
245 if (!text.isEmpty()) {
246 if (!d->m_words.isEmpty()) {
247 TinyTextEntity *lastEntity = d->m_words.last();
248 const QString concatText = lastEntity->text() + text.normalized(QString::NormalizationForm_KC);
249 if (concatText != concatText.normalized(QString::NormalizationForm_KC)) {
250 // If this happens it means that the new text + old one have combined, for example A and ◌̊ form Å
251 NormalizedRect newArea = *area | lastEntity->area;
252 delete area;
253 delete lastEntity;
254 d->m_words.removeLast();
255 d->m_words.append(new TinyTextEntity(concatText.normalized(QString::NormalizationForm_KC), newArea));
256 return;
257 }
258 }
259
260 d->m_words.append(new TinyTextEntity(text.normalized(QString::NormalizationForm_KC), *area));
261 }
262 delete area;
263 }
264
265 struct WordWithCharacters {
WordWithCharactersWordWithCharacters266 WordWithCharacters(TinyTextEntity *w, const TextList &c)
267 : word(w)
268 , characters(c)
269 {
270 }
271
textWordWithCharacters272 inline QString text() const
273 {
274 return word->text();
275 }
276
areaWordWithCharacters277 inline const NormalizedRect &area() const
278 {
279 return word->area;
280 }
281
282 TinyTextEntity *word;
283 TextList characters;
284 };
285 typedef QList<WordWithCharacters> WordsWithCharacters;
286
287 /**
288 * We will divide the whole page in some regions depending on the horizontal and
289 * vertical spacing among different regions. Each region will have an area and an
290 * associated WordsWithCharacters in sorted order.
291 */
292 class RegionText
293 {
294 public:
RegionText()295 RegionText() {};
296
RegionText(const WordsWithCharacters & wordsWithCharacters,const QRect area)297 RegionText(const WordsWithCharacters &wordsWithCharacters, const QRect area)
298 : m_region_wordWithCharacters(wordsWithCharacters)
299 , m_area(area)
300 {
301 }
302
string() const303 inline QString string() const
304 {
305 QString res;
306 for (const WordWithCharacters &word : m_region_wordWithCharacters) {
307 res += word.text();
308 }
309 return res;
310 }
311
text() const312 inline WordsWithCharacters text() const
313 {
314 return m_region_wordWithCharacters;
315 }
316
area() const317 inline QRect area() const
318 {
319 return m_area;
320 }
321
setArea(const QRect area)322 inline void setArea(const QRect area)
323 {
324 m_area = area;
325 }
326
setText(const WordsWithCharacters & wordsWithCharacters)327 inline void setText(const WordsWithCharacters &wordsWithCharacters)
328 {
329 m_region_wordWithCharacters = wordsWithCharacters;
330 }
331
332 private:
333 WordsWithCharacters m_region_wordWithCharacters;
334 QRect m_area;
335 };
336
textArea(TextSelection * sel) const337 RegularAreaRect *TextPage::textArea(TextSelection *sel) const
338 {
339 if (d->m_words.isEmpty())
340 return new RegularAreaRect();
341
342 /**
343 It works like this:
344 There are two cursors, we need to select all the text between them. The coordinates are normalised, leftTop is (0,0)
345 rightBottom is (1,1), so for cursors start (sx,sy) and end (ex,ey) we start with finding text rectangles under those
346 points, if not we search for the first that is to the right to it in the same baseline, if none found, then we search
347 for the first rectangle with a baseline under the cursor, having two points that are the best rectangles to both
348 of the cursors: (rx,ry)x(tx,ty) for start and (ux,uy)x(vx,vy) for end, we do a
349 1. (rx,ry)x(1,ty)
350 2. (0,ty)x(1,uy)
351 3. (0,uy)x(vx,vy)
352
353 To find the closest rectangle to cursor (cx,cy) we search for a rectangle that either contains the cursor
354 or that has a left border >= cx and bottom border >= cy.
355 */
356 RegularAreaRect *ret = new RegularAreaRect;
357
358 PagePrivate *pagePrivate = PagePrivate::get(d->m_page);
359 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
360 const double scaleX = d->m_page->width();
361 const double scaleY = d->m_page->height();
362
363 NormalizedPoint startC = sel->start();
364 NormalizedPoint endC = sel->end();
365 NormalizedPoint temp;
366
367 // if startPoint is right to endPoint swap them
368 if (startC.x > endC.x) {
369 temp = startC;
370 startC = endC;
371 endC = temp;
372 }
373
374 // minX,maxX,minY,maxY gives the bounding rectangle coordinates of the document
375 const NormalizedRect boundingRect = d->m_page->boundingBox();
376 const QRect content = boundingRect.geometry(scaleX, scaleY);
377 const double minX = content.left();
378 const double maxX = content.right();
379 const double minY = content.top();
380 const double maxY = content.bottom();
381
382 /**
383 * We will now find out the TinyTextEntity for the startRectangle and TinyTextEntity for
384 * the endRectangle. We have four cases:
385 *
386 * Case 1(a): both startpoint and endpoint are out of the bounding Rectangle and at one side, so the rectangle made of start
387 * and endPoint are outof the bounding rect (do not intersect)
388 *
389 * Case 1(b): both startpoint and endpoint are out of bounding rect, but they are in different side, so is their rectangle
390 *
391 * Case 2(a): find the rectangle which contains start and endpoint and having some
392 * TextEntity
393 *
394 * Case 2(b): if 2(a) fails (if startPoint and endPoint both are unchanged), then we check whether there is any
395 * TextEntity within the rect made by startPoint and endPoint
396 *
397 * Case 3: Now, we may have two type of selection.
398 * 1. startpoint is left-top of start_end and endpoint is right-bottom
399 * 2. startpoint is left-bottom of start_end and endpoint is top-right
400 *
401 * Also, as 2(b) is passed, we might have it,itEnd or both unchanged, but the fact is that we have
402 * text within them. so, we need to search for the best suitable textposition for start and end.
403 *
404 * Case 3(a): We search the nearest rectangle consisting of some
405 * TinyTextEntity right to or bottom of the startPoint for selection 01.
406 * And, for selection 02, we have to search for right and top
407 *
408 * Case 3(b): For endpoint, we have to find the point top of or left to
409 * endpoint if we have selection 01.
410 * Otherwise, the search will be left and bottom
411 */
412
413 // we know that startC.x > endC.x, we need to decide which is top and which is bottom
414 const NormalizedRect start_end = (startC.y < endC.y) ? NormalizedRect(startC.x, startC.y, endC.x, endC.y) : NormalizedRect(startC.x, endC.y, endC.x, startC.y);
415
416 // Case 1(a)
417 if (!boundingRect.intersects(start_end))
418 return ret;
419
420 // case 1(b)
421 /**
422 note that, after swapping of start and end, we know that,
423 start is always left to end. but, we cannot say start is
424 positioned upper than end.
425 **/
426 else {
427 // if start is left to content rect take it to content rect boundary
428 if (startC.x * scaleX < minX)
429 startC.x = minX / scaleX;
430 if (endC.x * scaleX > maxX)
431 endC.x = maxX / scaleX;
432
433 // if start is top to end (selection type 01)
434 if (startC.y * scaleY < minY)
435 startC.y = minY / scaleY;
436 if (endC.y * scaleY > maxY)
437 endC.y = maxY / scaleY;
438
439 // if start is bottom to end (selection type 02)
440 if (startC.y * scaleY > maxY)
441 startC.y = maxY / scaleY;
442 if (endC.y * scaleY < minY)
443 endC.y = minY / scaleY;
444 }
445
446 TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
447 TextList::ConstIterator start = it, end = itEnd, tmpIt = it; //, tmpItEnd = itEnd;
448 const MergeSide side = d->m_page ? (MergeSide)d->m_page->totalOrientation() : MergeRight;
449
450 NormalizedRect tmp;
451 // case 2(a)
452 for (; it != itEnd; ++it) {
453 tmp = (*it)->area;
454 if (tmp.contains(startC.x, startC.y)) {
455 start = it;
456 }
457 if (tmp.contains(endC.x, endC.y)) {
458 end = it;
459 }
460 }
461
462 // case 2(b)
463 it = tmpIt;
464 if (start == it && end == itEnd) {
465 for (; it != itEnd; ++it) {
466 // is there any text rectangle within the start_end rect
467 tmp = (*it)->area;
468 if (start_end.intersects(tmp))
469 break;
470 }
471
472 // we have searched every text entities, but none is within the rectangle created by start and end
473 // so, no selection should be done
474 if (it == itEnd) {
475 return ret;
476 }
477 }
478 it = tmpIt;
479 bool selection_two_start = false;
480
481 // case 3.a
482 if (start == it) {
483 bool flagV = false;
484 NormalizedRect rect;
485
486 // selection type 01
487 if (startC.y <= endC.y) {
488 for (; it != itEnd; ++it) {
489 rect = (*it)->area;
490 rect.isBottom(startC) ? flagV = false : flagV = true;
491
492 if (flagV && rect.isRight(startC)) {
493 start = it;
494 break;
495 }
496 }
497 }
498
499 // selection type 02
500 else {
501 selection_two_start = true;
502 int distance = scaleX + scaleY + 100;
503 int count = 0;
504
505 for (; it != itEnd; ++it) {
506 rect = (*it)->area;
507
508 if (rect.isBottomOrLevel(startC) && rect.isRight(startC)) {
509 count++;
510 QRect entRect = rect.geometry(scaleX, scaleY);
511 int xdist, ydist;
512 xdist = entRect.center().x() - startC.x * scaleX;
513 ydist = entRect.center().y() - startC.y * scaleY;
514
515 // make them positive
516 if (xdist < 0)
517 xdist = -xdist;
518 if (ydist < 0)
519 ydist = -ydist;
520
521 if ((xdist + ydist) < distance) {
522 distance = xdist + ydist;
523 start = it;
524 }
525 }
526 }
527 }
528 }
529
530 // case 3.b
531 if (end == itEnd) {
532 it = tmpIt;
533 itEnd = itEnd - 1;
534
535 bool flagV = false;
536 NormalizedRect rect;
537
538 if (startC.y <= endC.y) {
539 for (; itEnd >= it; itEnd--) {
540 rect = (*itEnd)->area;
541 rect.isTop(endC) ? flagV = false : flagV = true;
542
543 if (flagV && rect.isLeft(endC)) {
544 end = itEnd;
545 break;
546 }
547 }
548 }
549
550 else {
551 int distance = scaleX + scaleY + 100;
552 for (; itEnd >= it; itEnd--) {
553 rect = (*itEnd)->area;
554
555 if (rect.isTopOrLevel(endC) && rect.isLeft(endC)) {
556 QRect entRect = rect.geometry(scaleX, scaleY);
557 int xdist, ydist;
558 xdist = entRect.center().x() - endC.x * scaleX;
559 ydist = entRect.center().y() - endC.y * scaleY;
560
561 // make them positive
562 if (xdist < 0)
563 xdist = -xdist;
564 if (ydist < 0)
565 ydist = -ydist;
566
567 if ((xdist + ydist) < distance) {
568 distance = xdist + ydist;
569 end = itEnd;
570 }
571 }
572 }
573 }
574 }
575
576 /* if start and end in selection 02 are in the same column, and we
577 start at an empty space we have to remove the selection of last
578 character
579 */
580 if (selection_two_start) {
581 if (start > end) {
582 start = start - 1;
583 }
584 }
585
586 // if start is less than end swap them
587 if (start > end) {
588 it = start;
589 start = end;
590 end = it;
591 }
592
593 // removes the possibility of crash, in case none of 1 to 3 is true
594 if (end == d->m_words.constEnd())
595 end--;
596
597 for (; start <= end; start++) {
598 ret->appendShape((*start)->transformedArea(matrix), side);
599 }
600
601 return ret;
602 }
603
findText(int searchID,const QString & query,SearchDirection direct,Qt::CaseSensitivity caseSensitivity,const RegularAreaRect * area)604 RegularAreaRect *TextPage::findText(int searchID, const QString &query, SearchDirection direct, Qt::CaseSensitivity caseSensitivity, const RegularAreaRect *area)
605 {
606 SearchDirection dir = direct;
607 // invalid search request
608 if (d->m_words.isEmpty() || query.isEmpty() || (area && area->isNull()))
609 return nullptr;
610 TextList::ConstIterator start;
611 int start_offset = 0;
612 TextList::ConstIterator end;
613 const QMap<int, SearchPoint *>::const_iterator sIt = d->m_searchPoints.constFind(searchID);
614 if (sIt == d->m_searchPoints.constEnd()) {
615 // if no previous run of this search is found, then set it to start
616 // from the beginning (respecting the search direction)
617 if (dir == NextResult)
618 dir = FromTop;
619 else if (dir == PreviousResult)
620 dir = FromBottom;
621 }
622 bool forward = true;
623 switch (dir) {
624 case FromTop:
625 start = d->m_words.constBegin();
626 start_offset = 0;
627 end = d->m_words.constEnd();
628 break;
629 case FromBottom:
630 start = d->m_words.constEnd();
631 start_offset = 0;
632 end = d->m_words.constBegin();
633 forward = false;
634 break;
635 case NextResult:
636 start = (*sIt)->it_end;
637 start_offset = (*sIt)->offset_end;
638 end = d->m_words.constEnd();
639 break;
640 case PreviousResult:
641 start = (*sIt)->it_begin;
642 start_offset = (*sIt)->offset_begin;
643 end = d->m_words.constBegin();
644 forward = false;
645 break;
646 };
647 RegularAreaRect *ret = nullptr;
648 const TextComparisonFunction cmpFn = caseSensitivity == Qt::CaseSensitive ? CaseSensitiveCmpFn : CaseInsensitiveCmpFn;
649 if (forward) {
650 ret = d->findTextInternalForward(searchID, query, cmpFn, start, start_offset, end);
651 } else {
652 ret = d->findTextInternalBackward(searchID, query, cmpFn, start, start_offset, end);
653 }
654 return ret;
655 }
656
657 // hyphenated '-' must be at the end of a word, so hyphenation means
658 // we have a '-' just followed by a '\n' character
659 // check if the string contains a '-' character
660 // if the '-' is the last entry
stringLengthAdaptedWithHyphen(const QString & str,const TextList::ConstIterator & it,const TextList::ConstIterator & textListEnd)661 static int stringLengthAdaptedWithHyphen(const QString &str, const TextList::ConstIterator &it, const TextList::ConstIterator &textListEnd)
662 {
663 const int len = str.length();
664
665 // hyphenated '-' must be at the end of a word, so hyphenation means
666 // we have a '-' just followed by a '\n' character
667 // check if the string contains a '-' character
668 // if the '-' is the last entry
669 if (str.endsWith(QLatin1Char('-'))) {
670 // validity chek of it + 1
671 if ((it + 1) != textListEnd) {
672 // 1. if the next character is '\n'
673 const QString &lookahedStr = (*(it + 1))->text();
674 if (lookahedStr.startsWith(QLatin1Char('\n'))) {
675 return len - 1;
676 }
677
678 // 2. if the next word is in a different line or not
679 const NormalizedRect &hyphenArea = (*it)->area;
680 const NormalizedRect &lookaheadArea = (*(it + 1))->area;
681
682 // lookahead to check whether both the '-' rect and next character rect overlap
683 if (!doesConsumeY(hyphenArea, lookaheadArea, 70)) {
684 return len - 1;
685 }
686 }
687 }
688 // else if it is the second last entry - for example in pdf format
689 else if (str.endsWith(QLatin1String("-\n"))) {
690 return len - 2;
691 }
692
693 return len;
694 }
695
searchPointToArea(const SearchPoint * sp)696 RegularAreaRect *TextPagePrivate::searchPointToArea(const SearchPoint *sp)
697 {
698 PagePrivate *pagePrivate = PagePrivate::get(m_page);
699 const QTransform matrix = pagePrivate ? pagePrivate->rotationMatrix() : QTransform();
700 RegularAreaRect *ret = new RegularAreaRect;
701
702 for (TextList::ConstIterator it = sp->it_begin;; it++) {
703 const TinyTextEntity *curEntity = *it;
704 ret->append(curEntity->transformedArea(matrix));
705
706 if (it == sp->it_end) {
707 break;
708 }
709 }
710
711 ret->simplify();
712 return ret;
713 }
714
findTextInternalForward(int searchID,const QString & _query,TextComparisonFunction comparer,const TextList::ConstIterator & start,int start_offset,const TextList::ConstIterator & end)715 RegularAreaRect *TextPagePrivate::findTextInternalForward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
716 {
717 // normalize query search all unicode (including glyphs)
718 const QString query = _query.normalized(QString::NormalizationForm_KC);
719
720 // j is the current position in our query
721 // queryLeft is the length of the query we have left to match
722 int j = 0, queryLeft = query.length();
723
724 TextList::ConstIterator it = start;
725 int offset = start_offset;
726
727 TextList::ConstIterator it_begin = TextList::ConstIterator();
728 int offset_begin = 0; // dummy initial value to suppress compiler warnings
729
730 while (it != end) {
731 const TinyTextEntity *curEntity = *it;
732 const QString &str = curEntity->text();
733 const int strLen = str.length();
734 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
735 // adjustedLen <= strLen
736
737 if (offset >= strLen) {
738 it++;
739 offset = 0;
740 continue;
741 }
742
743 if (it_begin == TextList::ConstIterator()) {
744 it_begin = it;
745 offset_begin = offset;
746 }
747
748 // Let the user write the hyphen or not when searching for text
749 int matchedLen = -1;
750 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
751 // we have equal (or less than) area of the query left as the length of the current
752 // entity
753 const int min = qMin(queryLeft, matchingLen - offset);
754 if (comparer(str.midRef(offset, min), query.midRef(j, min))) {
755 matchedLen = min;
756 break;
757 }
758 }
759
760 if (matchedLen == -1) {
761 // we have not matched
762 // this means we do not have a complete match
763 // we need to get back to query start
764 // and continue the search from this place
765 #ifdef DEBUG_TEXTPAGE
766 qCDebug(OkularCoreDebug) << "\tnot matched";
767 #endif
768 j = 0;
769 queryLeft = query.length();
770 it = it_begin;
771 offset = offset_begin + 1;
772 it_begin = TextList::ConstIterator();
773 } else {
774 // we have a match
775 // move the current position in the query
776 // to the position after the length of this string
777 // we matched
778 // subtract the length of the current entity from
779 // the left length of the query
780 #ifdef DEBUG_TEXTPAGE
781 qCDebug(OkularCoreDebug) << "\tmatched" << matchedLen;
782 #endif
783 j += matchedLen;
784 queryLeft -= matchedLen;
785
786 if (queryLeft == 0) {
787 // save or update the search point for the current searchID
788 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
789 if (sIt == m_searchPoints.end()) {
790 sIt = m_searchPoints.insert(searchID, new SearchPoint);
791 }
792 SearchPoint *sp = *sIt;
793 sp->it_begin = it_begin;
794 sp->it_end = it;
795 sp->offset_begin = offset_begin;
796 sp->offset_end = offset + matchedLen;
797 return searchPointToArea(sp);
798 }
799
800 it++;
801 offset = 0;
802 }
803 }
804 // end of loop - it means that we've ended the textentities
805
806 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
807 if (sIt != m_searchPoints.end()) {
808 SearchPoint *sp = *sIt;
809 m_searchPoints.erase(sIt);
810 delete sp;
811 }
812 return nullptr;
813 }
814
findTextInternalBackward(int searchID,const QString & _query,TextComparisonFunction comparer,const TextList::ConstIterator & start,int start_offset,const TextList::ConstIterator & end)815 RegularAreaRect *TextPagePrivate::findTextInternalBackward(int searchID, const QString &_query, TextComparisonFunction comparer, const TextList::ConstIterator &start, int start_offset, const TextList::ConstIterator &end)
816 {
817 // normalize query to search all unicode (including glyphs)
818 const QString query = _query.normalized(QString::NormalizationForm_KC);
819
820 // j is the current position in our query
821 // len is the length of the string in TextEntity
822 // queryLeft is the length of the query we have left
823 int j = query.length(), queryLeft = query.length();
824
825 TextList::ConstIterator it = start;
826 int offset = start_offset;
827
828 TextList::ConstIterator it_begin = TextList::ConstIterator();
829 int offset_begin = 0; // dummy initial value to suppress compiler warnings
830
831 while (true) {
832 if (offset <= 0) {
833 if (it == end) {
834 break;
835 }
836 it--;
837 }
838
839 const TinyTextEntity *curEntity = *it;
840 const QString &str = curEntity->text();
841 const int strLen = str.length();
842 const int adjustedLen = stringLengthAdaptedWithHyphen(str, it, m_words.constEnd());
843 // adjustedLen <= strLen
844
845 if (offset <= 0) {
846 offset = strLen;
847 }
848
849 if (it_begin == TextList::ConstIterator()) {
850 it_begin = it;
851 offset_begin = offset;
852 }
853
854 // Let the user write the hyphen or not when searching for text
855 int matchedLen = -1;
856 // we have equal (or less than) area of the query left as the length of the current
857 // entity
858 for (int matchingLen = strLen; matchingLen >= adjustedLen; matchingLen--) {
859 const int hyphenOffset = (strLen - matchingLen);
860 const int min = qMin(queryLeft + hyphenOffset, offset);
861 if (comparer(str.midRef(offset - min, min - hyphenOffset), query.midRef(j - min + hyphenOffset, min - hyphenOffset))) {
862 matchedLen = min - hyphenOffset;
863 break;
864 }
865 }
866
867 if (matchedLen == -1) {
868 // we have not matched
869 // this means we do not have a complete match
870 // we need to get back to query start
871 // and continue the search from this place
872 #ifdef DEBUG_TEXTPAGE
873 qCDebug(OkularCoreDebug) << "\tnot matched";
874 #endif
875
876 j = query.length();
877 queryLeft = query.length();
878 it = it_begin;
879 offset = offset_begin - 1;
880 it_begin = TextList::ConstIterator();
881 } else {
882 // we have a match
883 // move the current position in the query
884 // to the position after the length of this string
885 // we matched
886 // subtract the length of the current entity from
887 // the left length of the query
888 #ifdef DEBUG_TEXTPAGE
889 qCDebug(OkularCoreDebug) << "\tmatched";
890 #endif
891 j -= matchedLen;
892 queryLeft -= matchedLen;
893
894 if (queryLeft == 0) {
895 // save or update the search point for the current searchID
896 QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
897 if (sIt == m_searchPoints.end()) {
898 sIt = m_searchPoints.insert(searchID, new SearchPoint);
899 }
900 SearchPoint *sp = *sIt;
901 sp->it_begin = it;
902 sp->it_end = it_begin;
903 sp->offset_begin = offset - matchedLen;
904 sp->offset_end = offset_begin;
905 return searchPointToArea(sp);
906 }
907
908 offset = 0;
909 }
910 }
911 // end of loop - it means that we've ended the textentities
912
913 const QMap<int, SearchPoint *>::iterator sIt = m_searchPoints.find(searchID);
914 if (sIt != m_searchPoints.end()) {
915 SearchPoint *sp = *sIt;
916 m_searchPoints.erase(sIt);
917 delete sp;
918 }
919 return nullptr;
920 }
921
text(const RegularAreaRect * area) const922 QString TextPage::text(const RegularAreaRect *area) const
923 {
924 return text(area, AnyPixelTextAreaInclusionBehaviour);
925 }
926
text(const RegularAreaRect * area,TextAreaInclusionBehaviour b) const927 QString TextPage::text(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
928 {
929 if (area && area->isNull())
930 return QString();
931
932 TextList::ConstIterator it = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
933 QString ret;
934 if (area) {
935 for (; it != itEnd; ++it) {
936 if (b == AnyPixelTextAreaInclusionBehaviour) {
937 if (area->intersects((*it)->area)) {
938 ret += (*it)->text();
939 }
940 } else {
941 NormalizedPoint center = (*it)->area.center();
942 if (area->contains(center.x, center.y)) {
943 ret += (*it)->text();
944 }
945 }
946 }
947 } else {
948 for (; it != itEnd; ++it)
949 ret += (*it)->text();
950 }
951 return ret;
952 }
953
compareTinyTextEntityX(const WordWithCharacters & first,const WordWithCharacters & second)954 static bool compareTinyTextEntityX(const WordWithCharacters &first, const WordWithCharacters &second)
955 {
956 QRect firstArea = first.area().roundedGeometry(1000, 1000);
957 QRect secondArea = second.area().roundedGeometry(1000, 1000);
958
959 return firstArea.left() < secondArea.left();
960 }
961
compareTinyTextEntityY(const WordWithCharacters & first,const WordWithCharacters & second)962 static bool compareTinyTextEntityY(const WordWithCharacters &first, const WordWithCharacters &second)
963 {
964 const QRect firstArea = first.area().roundedGeometry(1000, 1000);
965 const QRect secondArea = second.area().roundedGeometry(1000, 1000);
966
967 return firstArea.top() < secondArea.top();
968 }
969
970 /**
971 * Sets a new world list. Deleting the contents of the old one
972 */
setWordList(const TextList & list)973 void TextPagePrivate::setWordList(const TextList &list)
974 {
975 qDeleteAll(m_words);
976 m_words = list;
977 }
978
979 /**
980 * Remove all the spaces in between texts. It will make all the generators
981 * same, whether they save spaces(like pdf) or not(like djvu).
982 */
removeSpace(TextList * words)983 static void removeSpace(TextList *words)
984 {
985 TextList::Iterator it = words->begin();
986 const QString str(QLatin1Char(' '));
987
988 while (it != words->end()) {
989 if ((*it)->text() == str) {
990 it = words->erase(it);
991 } else {
992 ++it;
993 }
994 }
995 }
996
997 /**
998 * We will read the TinyTextEntity from characters and try to create words from there.
999 * Note: characters might be already characters for some generators, but we will keep
1000 * the nomenclature characters for the generator produced data. The resulting
1001 * WordsWithCharacters memory has to be managed by the caller, both the
1002 * WordWithCharacters::word and WordWithCharacters::characters contents
1003 */
makeWordFromCharacters(const TextList & characters,int pageWidth,int pageHeight)1004 static WordsWithCharacters makeWordFromCharacters(const TextList &characters, int pageWidth, int pageHeight)
1005 {
1006 /**
1007 * We will traverse characters and try to create words from the TinyTextEntities in it.
1008 * We will search TinyTextEntity blocks and merge them until we get a
1009 * space between two consecutive TinyTextEntities. When we get a space
1010 * we can take it as a end of word. Then we store the word as a TinyTextEntity
1011 * and keep it in newList.
1012
1013 * We create a RegionText named regionWord that contains the word and the characters associated with it and
1014 * a rectangle area of the element in newList.
1015
1016 */
1017 WordsWithCharacters wordsWithCharacters;
1018
1019 TextList::ConstIterator it = characters.begin(), itEnd = characters.end(), tmpIt;
1020 int newLeft, newRight, newTop, newBottom;
1021 int index = 0;
1022
1023 for (; it != itEnd; it++) {
1024 QString textString = (*it)->text();
1025 QString newString;
1026 QRect lineArea = (*it)->area.roundedGeometry(pageWidth, pageHeight), elementArea;
1027 TextList wordCharacters;
1028 tmpIt = it;
1029 int space = 0;
1030
1031 while (!space) {
1032 if (!textString.isEmpty()) {
1033 newString.append(textString);
1034
1035 // when textString is the start of the word
1036 if (tmpIt == it) {
1037 NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1038 wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1039 } else {
1040 NormalizedRect newRect(elementArea, pageWidth, pageHeight);
1041 wordCharacters.append(new TinyTextEntity(textString.normalized(QString::NormalizationForm_KC), newRect));
1042 }
1043 }
1044
1045 ++it;
1046
1047 /*
1048 we must have to put this line before the if condition of it==itEnd
1049 otherwise the last character can be missed
1050 */
1051 if (it == itEnd)
1052 break;
1053 elementArea = (*it)->area.roundedGeometry(pageWidth, pageHeight);
1054 if (!doesConsumeY(elementArea, lineArea, 60)) {
1055 --it;
1056 break;
1057 }
1058
1059 const int text_y1 = elementArea.top(), text_x1 = elementArea.left(), text_y2 = elementArea.y() + elementArea.height(), text_x2 = elementArea.x() + elementArea.width();
1060 const int line_y1 = lineArea.top(), line_x1 = lineArea.left(), line_y2 = lineArea.y() + lineArea.height(), line_x2 = lineArea.x() + lineArea.width();
1061
1062 space = elementArea.left() - lineArea.right();
1063
1064 if (space != 0) {
1065 it--;
1066 break;
1067 }
1068
1069 newLeft = text_x1 < line_x1 ? text_x1 : line_x1;
1070 newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1071 newTop = text_y1 > line_y1 ? line_y1 : text_y1;
1072 newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1073
1074 lineArea.setLeft(newLeft);
1075 lineArea.setTop(newTop);
1076 lineArea.setWidth(newRight - newLeft);
1077 lineArea.setHeight(newBottom - newTop);
1078
1079 textString = (*it)->text();
1080 }
1081
1082 // if newString is not empty, save it
1083 if (!newString.isEmpty()) {
1084 const NormalizedRect newRect(lineArea, pageWidth, pageHeight);
1085 TinyTextEntity *word = new TinyTextEntity(newString.normalized(QString::NormalizationForm_KC), newRect);
1086 wordsWithCharacters.append(WordWithCharacters(word, wordCharacters));
1087
1088 index++;
1089 }
1090
1091 if (it == itEnd)
1092 break;
1093 }
1094
1095 return wordsWithCharacters;
1096 }
1097
1098 /**
1099 * Create Lines from the words and sort them
1100 */
makeAndSortLines(const WordsWithCharacters & wordsTmp,int pageWidth,int pageHeight)1101 QList<QPair<WordsWithCharacters, QRect>> makeAndSortLines(const WordsWithCharacters &wordsTmp, int pageWidth, int pageHeight)
1102 {
1103 /**
1104 * We cannot assume that the generator will give us texts in the right order.
1105 * We can only assume that we will get texts in the page and their bounding
1106 * rectangle. The texts can be character, word, half-word anything.
1107 * So, we need to:
1108 **
1109 * 1. Sort rectangles/boxes containing texts by y0(top)
1110 * 2. Create textline where there is y overlap between TinyTextEntity 's
1111 * 3. Within each line sort the TinyTextEntity 's by x0(left)
1112 */
1113
1114 QList<QPair<WordsWithCharacters, QRect>> lines;
1115
1116 /*
1117 Make a new copy of the TextList in the words, so that the wordsTmp and lines do
1118 not contain same pointers for all the TinyTextEntity.
1119 */
1120 QList<WordWithCharacters> words = wordsTmp;
1121
1122 // Step 1
1123 std::sort(words.begin(), words.end(), compareTinyTextEntityY);
1124
1125 // Step 2
1126 QList<WordWithCharacters>::Iterator it = words.begin(), itEnd = words.end();
1127
1128 // for every non-space texts(characters/words) in the textList
1129 for (; it != itEnd; it++) {
1130 const QRect elementArea = (*it).area().roundedGeometry(pageWidth, pageHeight);
1131 bool found = false;
1132
1133 for (QPair<WordsWithCharacters, QRect> &linesI : lines) {
1134 /* the line area which will be expanded
1135 line_rects is only necessary to preserve the topmin and bottommax of all
1136 the texts in the line, left and right is not necessary at all
1137 */
1138 QRect &lineArea = linesI.second;
1139 const int text_y1 = elementArea.top(), text_y2 = elementArea.top() + elementArea.height(), text_x1 = elementArea.left(), text_x2 = elementArea.left() + elementArea.width();
1140 const int line_y1 = lineArea.top(), line_y2 = lineArea.top() + lineArea.height(), line_x1 = lineArea.left(), line_x2 = lineArea.left() + lineArea.width();
1141
1142 /*
1143 if the new text and the line has y overlapping parts of more than 70%,
1144 the text will be added to this line
1145 */
1146 if (doesConsumeY(elementArea, lineArea, 70)) {
1147 WordsWithCharacters &line = linesI.first;
1148 line.append(*it);
1149
1150 const int newLeft = line_x1 < text_x1 ? line_x1 : text_x1;
1151 const int newRight = line_x2 > text_x2 ? line_x2 : text_x2;
1152 const int newTop = line_y1 < text_y1 ? line_y1 : text_y1;
1153 const int newBottom = text_y2 > line_y2 ? text_y2 : line_y2;
1154
1155 lineArea = QRect(newLeft, newTop, newRight - newLeft, newBottom - newTop);
1156 found = true;
1157 }
1158
1159 if (found)
1160 break;
1161 }
1162
1163 /* when we have found a new line create a new TextList containing
1164 only one element and append it to the lines
1165 */
1166 if (!found) {
1167 WordsWithCharacters tmp;
1168 tmp.append((*it));
1169 lines.append(QPair<WordsWithCharacters, QRect>(tmp, elementArea));
1170 }
1171 }
1172
1173 // Step 3
1174 for (QPair<WordsWithCharacters, QRect> &line : lines) {
1175 WordsWithCharacters &list = line.first;
1176 std::sort(list.begin(), list.end(), compareTinyTextEntityX);
1177 }
1178
1179 return lines;
1180 }
1181
1182 /**
1183 * Calculate Statistical information from the lines we made previously
1184 */
calculateStatisticalInformation(const QList<WordWithCharacters> & words,int pageWidth,int pageHeight,int * word_spacing,int * line_spacing,int * col_spacing)1185 static void calculateStatisticalInformation(const QList<WordWithCharacters> &words, int pageWidth, int pageHeight, int *word_spacing, int *line_spacing, int *col_spacing)
1186 {
1187 /**
1188 * For the region, defined by line_rects and lines
1189 * 1. Make line statistical analysis to find the line spacing
1190 * 2. Make character statistical analysis to differentiate between
1191 * word spacing and column spacing.
1192 */
1193
1194 /**
1195 * Step 0
1196 */
1197 const QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(words, pageWidth, pageHeight);
1198
1199 /**
1200 * Step 1
1201 */
1202 QMap<int, int> line_space_stat;
1203 for (int i = 0; i < sortedLines.length(); i++) {
1204 const QRect rectUpper = sortedLines.at(i).second;
1205
1206 if (i + 1 == sortedLines.length())
1207 break;
1208 const QRect rectLower = sortedLines.at(i + 1).second;
1209
1210 int linespace = rectLower.top() - (rectUpper.top() + rectUpper.height());
1211 if (linespace < 0)
1212 linespace = -linespace;
1213
1214 if (line_space_stat.contains(linespace))
1215 line_space_stat[linespace]++;
1216 else
1217 line_space_stat[linespace] = 1;
1218 }
1219
1220 *line_spacing = 0;
1221 int weighted_count = 0;
1222 QMapIterator<int, int> iterate_linespace(line_space_stat);
1223
1224 while (iterate_linespace.hasNext()) {
1225 iterate_linespace.next();
1226 *line_spacing += iterate_linespace.value() * iterate_linespace.key();
1227 weighted_count += iterate_linespace.value();
1228 }
1229 if (*line_spacing != 0)
1230 *line_spacing = (int)((double)*line_spacing / (double)weighted_count + 0.5);
1231
1232 /**
1233 * Step 2
1234 */
1235 // We would like to use QMap instead of QHash as it will keep the keys sorted
1236 QMap<int, int> hor_space_stat;
1237 QMap<int, int> col_space_stat;
1238 QList<QList<QRect>> space_rects;
1239 QVector<QRect> max_hor_space_rects;
1240
1241 // Space in every line
1242 for (const QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1243 const WordsWithCharacters list = sortedLine.first;
1244 QList<QRect> line_space_rects;
1245 int maxSpace = 0, minSpace = pageWidth;
1246
1247 // for every TinyTextEntity element in the line
1248 WordsWithCharacters::ConstIterator it = list.begin(), itEnd = list.end();
1249 QRect max_area1, max_area2;
1250 QString before_max, after_max;
1251
1252 // for every line
1253 for (; it != itEnd; it++) {
1254 const QRect area1 = (*it).area().roundedGeometry(pageWidth, pageHeight);
1255 if (it + 1 == itEnd)
1256 break;
1257
1258 const QRect area2 = (*(it + 1)).area().roundedGeometry(pageWidth, pageHeight);
1259 int space = area2.left() - area1.right();
1260
1261 if (space > maxSpace) {
1262 max_area1 = area1;
1263 max_area2 = area2;
1264 maxSpace = space;
1265 before_max = (*it).text();
1266 after_max = (*(it + 1)).text();
1267 }
1268
1269 if (space < minSpace && space != 0)
1270 minSpace = space;
1271
1272 // if we found a real space, whose length is not zero and also less than the pageWidth
1273 if (space != 0 && space != pageWidth) {
1274 // increase the count of the space amount
1275 if (hor_space_stat.contains(space))
1276 hor_space_stat[space]++;
1277 else
1278 hor_space_stat[space] = 1;
1279
1280 int left, right, top, bottom;
1281
1282 left = area1.right();
1283 right = area2.left();
1284
1285 top = area2.top() < area1.top() ? area2.top() : area1.top();
1286 bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1287
1288 QRect rect(left, top, right - left, bottom - top);
1289 line_space_rects.append(rect);
1290 }
1291 }
1292
1293 space_rects.append(line_space_rects);
1294
1295 if (hor_space_stat.contains(maxSpace)) {
1296 if (hor_space_stat[maxSpace] != 1)
1297 hor_space_stat[maxSpace]--;
1298 else
1299 hor_space_stat.remove(maxSpace);
1300 }
1301
1302 if (maxSpace != 0) {
1303 if (col_space_stat.contains(maxSpace))
1304 col_space_stat[maxSpace]++;
1305 else
1306 col_space_stat[maxSpace] = 1;
1307
1308 // store the max rect of each line
1309 const int left = max_area1.right();
1310 const int right = max_area2.left();
1311 const int top = (max_area1.top() > max_area2.top()) ? max_area2.top() : max_area1.top();
1312 const int bottom = (max_area1.bottom() < max_area2.bottom()) ? max_area2.bottom() : max_area1.bottom();
1313
1314 const QRect rect(left, top, right - left, bottom - top);
1315 max_hor_space_rects.append(rect);
1316 } else
1317 max_hor_space_rects.append(QRect(0, 0, 0, 0));
1318 }
1319
1320 // All the between word space counts are in hor_space_stat
1321 *word_spacing = 0;
1322 weighted_count = 0;
1323 QMapIterator<int, int> iterate(hor_space_stat);
1324
1325 while (iterate.hasNext()) {
1326 iterate.next();
1327
1328 if (iterate.key() > 0) {
1329 *word_spacing += iterate.value() * iterate.key();
1330 weighted_count += iterate.value();
1331 }
1332 }
1333 if (weighted_count)
1334 *word_spacing = (int)((double)*word_spacing / (double)weighted_count + 0.5);
1335
1336 *col_spacing = 0;
1337 QMapIterator<int, int> iterate_col(col_space_stat);
1338
1339 while (iterate_col.hasNext()) {
1340 iterate_col.next();
1341 if (iterate_col.value() > *col_spacing)
1342 *col_spacing = iterate_col.value();
1343 }
1344 *col_spacing = col_space_stat.key(*col_spacing);
1345
1346 // if there is just one line in a region, there is no point in dividing it
1347 if (sortedLines.length() == 1)
1348 *word_spacing = *col_spacing;
1349 }
1350
1351 /**
1352 * Implements the XY Cut algorithm for textpage segmentation
1353 * The resulting RegionTextList will contain RegionText whose WordsWithCharacters::word and
1354 * WordsWithCharacters::characters are reused from wordsWithCharacters (i.e. no new nor delete happens in this function)
1355 */
XYCutForBoundingBoxes(const QList<WordWithCharacters> & wordsWithCharacters,int pageWidth,int pageHeight)1356 static RegionTextList XYCutForBoundingBoxes(const QList<WordWithCharacters> &wordsWithCharacters, int pageWidth, int pageHeight)
1357 {
1358 RegionTextList tree;
1359 QRect contentRect(0, 0, pageWidth, pageHeight);
1360 const RegionText root(wordsWithCharacters, contentRect);
1361
1362 // start the tree with the root, it is our only region at the start
1363 tree.push_back(root);
1364
1365 int i = 0;
1366
1367 // while traversing the tree has not been ended
1368 while (i < tree.length()) {
1369 const RegionText node = tree.at(i);
1370 QRect regionRect = node.area();
1371
1372 /**
1373 * 1. calculation of projection profiles
1374 */
1375 // allocate the size of proj profiles and initialize with 0
1376 int size_proj_y = node.area().height();
1377 int size_proj_x = node.area().width();
1378 // dynamic memory allocation
1379 QVarLengthArray<int> proj_on_xaxis(size_proj_x);
1380 QVarLengthArray<int> proj_on_yaxis(size_proj_y);
1381
1382 for (int j = 0; j < size_proj_y; ++j)
1383 proj_on_yaxis[j] = 0;
1384 for (int j = 0; j < size_proj_x; ++j)
1385 proj_on_xaxis[j] = 0;
1386
1387 const QList<WordWithCharacters> list = node.text();
1388
1389 // Calculate tcx and tcy locally for each new region
1390 int word_spacing, line_spacing, column_spacing;
1391 calculateStatisticalInformation(list, pageWidth, pageHeight, &word_spacing, &line_spacing, &column_spacing);
1392
1393 const int tcx = word_spacing * 2;
1394 const int tcy = line_spacing * 2;
1395
1396 int maxX = 0, maxY = 0;
1397 int avgX = 0;
1398 int count;
1399
1400 // for every text in the region
1401 for (const WordWithCharacters &wwc : list) {
1402 TinyTextEntity *ent = wwc.word;
1403 const QRect entRect = ent->area.geometry(pageWidth, pageHeight);
1404
1405 // calculate vertical projection profile proj_on_xaxis1
1406 for (int k = entRect.left(); k <= entRect.left() + entRect.width(); ++k) {
1407 if ((k - regionRect.left()) < size_proj_x && (k - regionRect.left()) >= 0)
1408 proj_on_xaxis[k - regionRect.left()] += entRect.height();
1409 }
1410
1411 // calculate horizontal projection profile in the same way
1412 for (int k = entRect.top(); k <= entRect.top() + entRect.height(); ++k) {
1413 if ((k - regionRect.top()) < size_proj_y && (k - regionRect.top()) >= 0)
1414 proj_on_yaxis[k - regionRect.top()] += entRect.width();
1415 }
1416 }
1417
1418 for (int j = 0; j < size_proj_y; ++j) {
1419 if (proj_on_yaxis[j] > maxY)
1420 maxY = proj_on_yaxis[j];
1421 }
1422
1423 avgX = count = 0;
1424 for (int j = 0; j < size_proj_x; ++j) {
1425 if (proj_on_xaxis[j] > maxX)
1426 maxX = proj_on_xaxis[j];
1427 if (proj_on_xaxis[j]) {
1428 count++;
1429 avgX += proj_on_xaxis[j];
1430 }
1431 }
1432 if (count)
1433 avgX /= count;
1434
1435 /**
1436 * 2. Cleanup Boundary White Spaces and removal of noise
1437 */
1438 int xbegin = 0, xend = size_proj_x - 1;
1439 int ybegin = 0, yend = size_proj_y - 1;
1440 while (xbegin < size_proj_x && proj_on_xaxis[xbegin] <= 0)
1441 xbegin++;
1442 while (xend >= 0 && proj_on_xaxis[xend] <= 0)
1443 xend--;
1444 while (ybegin < size_proj_y && proj_on_yaxis[ybegin] <= 0)
1445 ybegin++;
1446 while (yend >= 0 && proj_on_yaxis[yend] <= 0)
1447 yend--;
1448
1449 // update the regionRect
1450 int old_left = regionRect.left(), old_top = regionRect.top();
1451 regionRect.setLeft(old_left + xbegin);
1452 regionRect.setRight(old_left + xend);
1453 regionRect.setTop(old_top + ybegin);
1454 regionRect.setBottom(old_top + yend);
1455
1456 int tnx = (int)((double)avgX * 10.0 / 100.0 + 0.5), tny = 0;
1457 for (int j = 0; j < size_proj_x; ++j)
1458 proj_on_xaxis[j] -= tnx;
1459 for (int j = 0; j < size_proj_y; ++j)
1460 proj_on_yaxis[j] -= tny;
1461
1462 /**
1463 * 3. Find the Widest gap
1464 */
1465 int gap_hor = -1, pos_hor = -1;
1466 int begin = -1, end = -1;
1467
1468 // find all hor_gaps and find the maximum between them
1469 for (int j = 1; j < size_proj_y; ++j) {
1470 // transition from white to black
1471 if (begin >= 0 && proj_on_yaxis[j - 1] <= 0 && proj_on_yaxis[j] > 0)
1472 end = j;
1473
1474 // transition from black to white
1475 if (proj_on_yaxis[j - 1] > 0 && proj_on_yaxis[j] <= 0)
1476 begin = j;
1477
1478 if (begin > 0 && end > 0 && end - begin > gap_hor) {
1479 gap_hor = end - begin;
1480 pos_hor = (end + begin) / 2;
1481 begin = -1;
1482 end = -1;
1483 }
1484 }
1485
1486 begin = -1, end = -1;
1487 int gap_ver = -1, pos_ver = -1;
1488
1489 // find all the ver_gaps and find the maximum between them
1490 for (int j = 1; j < size_proj_x; ++j) {
1491 // transition from white to black
1492 if (begin >= 0 && proj_on_xaxis[j - 1] <= 0 && proj_on_xaxis[j] > 0) {
1493 end = j;
1494 }
1495
1496 // transition from black to white
1497 if (proj_on_xaxis[j - 1] > 0 && proj_on_xaxis[j] <= 0)
1498 begin = j;
1499
1500 if (begin > 0 && end > 0 && end - begin > gap_ver) {
1501 gap_ver = end - begin;
1502 pos_ver = (end + begin) / 2;
1503 begin = -1;
1504 end = -1;
1505 }
1506 }
1507
1508 int cut_pos_x = pos_ver, cut_pos_y = pos_hor;
1509 int gap_x = gap_ver, gap_y = gap_hor;
1510
1511 /**
1512 * 4. Cut the region and make nodes (left,right) or (up,down)
1513 */
1514 bool cut_hor = false, cut_ver = false;
1515
1516 // For horizontal cut
1517 const int topHeight = cut_pos_y - (regionRect.top() - old_top);
1518 const QRect topRect(regionRect.left(), regionRect.top(), regionRect.width(), topHeight);
1519 const QRect bottomRect(regionRect.left(), regionRect.top() + topHeight, regionRect.width(), regionRect.height() - topHeight);
1520
1521 // For vertical Cut
1522 const int leftWidth = cut_pos_x - (regionRect.left() - old_left);
1523 const QRect leftRect(regionRect.left(), regionRect.top(), leftWidth, regionRect.height());
1524 const QRect rightRect(regionRect.left() + leftWidth, regionRect.top(), regionRect.width() - leftWidth, regionRect.height());
1525
1526 if (gap_y >= gap_x && gap_y >= tcy)
1527 cut_hor = true;
1528 else if (gap_y >= gap_x && gap_y <= tcy && gap_x >= tcx)
1529 cut_ver = true;
1530 else if (gap_x >= gap_y && gap_x >= tcx)
1531 cut_ver = true;
1532 else if (gap_x >= gap_y && gap_x <= tcx && gap_y >= tcy)
1533 cut_hor = true;
1534 // no cut possible
1535 else {
1536 // we can now update the node rectangle with the shrinked rectangle
1537 RegionText tmpNode = tree.at(i);
1538 tmpNode.setArea(regionRect);
1539 tree.replace(i, tmpNode);
1540 i++;
1541 continue;
1542 }
1543
1544 WordsWithCharacters list1, list2;
1545
1546 // horizontal cut, topRect and bottomRect
1547 if (cut_hor) {
1548 for (const WordWithCharacters &word : list) {
1549 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1550
1551 if (topRect.intersects(wordRect))
1552 list1.append(word);
1553 else
1554 list2.append(word);
1555 }
1556
1557 RegionText node1(list1, topRect);
1558 RegionText node2(list2, bottomRect);
1559
1560 tree.replace(i, node1);
1561 tree.insert(i + 1, node2);
1562 }
1563
1564 // vertical cut, leftRect and rightRect
1565 else if (cut_ver) {
1566 for (const WordWithCharacters &word : list) {
1567 const QRect wordRect = word.area().geometry(pageWidth, pageHeight);
1568
1569 if (leftRect.intersects(wordRect))
1570 list1.append(word);
1571 else
1572 list2.append(word);
1573 }
1574
1575 RegionText node1(list1, leftRect);
1576 RegionText node2(list2, rightRect);
1577
1578 tree.replace(i, node1);
1579 tree.insert(i + 1, node2);
1580 }
1581 }
1582
1583 return tree;
1584 }
1585
1586 /**
1587 * Add spaces in between words in a line. It reuses the pointers passed in tree and might add new ones. You will need to take care of deleting them if needed
1588 */
addNecessarySpace(RegionTextList tree,int pageWidth,int pageHeight)1589 WordsWithCharacters addNecessarySpace(RegionTextList tree, int pageWidth, int pageHeight)
1590 {
1591 /**
1592 * 1. Call makeAndSortLines before adding spaces in between words in a line
1593 * 2. Now add spaces between every two words in a line
1594 * 3. Finally, extract all the space separated texts from each region and return it
1595 */
1596
1597 // Only change the texts under RegionTexts, not the area
1598 for (RegionText &tmpRegion : tree) {
1599 // Step 01
1600 QList<QPair<WordsWithCharacters, QRect>> sortedLines = makeAndSortLines(tmpRegion.text(), pageWidth, pageHeight);
1601
1602 // Step 02
1603 for (QPair<WordsWithCharacters, QRect> &sortedLine : sortedLines) {
1604 WordsWithCharacters &list = sortedLine.first;
1605 for (int k = 0; k < list.length(); k++) {
1606 const QRect area1 = list.at(k).area().roundedGeometry(pageWidth, pageHeight);
1607 if (k + 1 >= list.length())
1608 break;
1609
1610 const QRect area2 = list.at(k + 1).area().roundedGeometry(pageWidth, pageHeight);
1611 const int space = area2.left() - area1.right();
1612
1613 if (space != 0) {
1614 // Make a TinyTextEntity of string space and push it between it and it+1
1615 const int left = area1.right();
1616 const int right = area2.left();
1617 const int top = area2.top() < area1.top() ? area2.top() : area1.top();
1618 const int bottom = area2.bottom() > area1.bottom() ? area2.bottom() : area1.bottom();
1619
1620 const QString spaceStr(QStringLiteral(" "));
1621 const QRect rect(QPoint(left, top), QPoint(right, bottom));
1622 const NormalizedRect entRect(rect, pageWidth, pageHeight);
1623 TinyTextEntity *ent1 = new TinyTextEntity(spaceStr, entRect);
1624 TinyTextEntity *ent2 = new TinyTextEntity(spaceStr, entRect);
1625 WordWithCharacters word(ent1, QList<TinyTextEntity *>() << ent2);
1626
1627 list.insert(k + 1, word);
1628
1629 // Skip the space
1630 k++;
1631 }
1632 }
1633 }
1634
1635 WordsWithCharacters tmpList;
1636 for (const QPair<WordsWithCharacters, QRect> &sortedLine : qAsConst(sortedLines)) {
1637 tmpList += sortedLine.first;
1638 }
1639 tmpRegion.setText(tmpList);
1640 }
1641
1642 // Step 03
1643 WordsWithCharacters tmp;
1644 for (const RegionText &tmpRegion : qAsConst(tree)) {
1645 tmp += tmpRegion.text();
1646 }
1647 return tmp;
1648 }
1649
1650 /**
1651 * Correct the textOrder, all layout recognition works here
1652 */
correctTextOrder()1653 void TextPagePrivate::correctTextOrder()
1654 {
1655 // m_page->width() and m_page->height() are in pixels at
1656 // 100% zoom level, and thus depend on display DPI.
1657 // To avoid Okular failing on lowDPI displays,
1658 // we scale pageWidth and pageHeight so their sum equals 2000.
1659 const double scalingFactor = 2000.0 / (m_page->width() + m_page->height());
1660 const int pageWidth = (int)(scalingFactor * m_page->width());
1661 const int pageHeight = (int)(scalingFactor * m_page->height());
1662
1663 TextList characters = m_words;
1664
1665 /**
1666 * Remove spaces from the text
1667 */
1668 removeSpace(&characters);
1669
1670 /**
1671 * Construct words from characters
1672 */
1673 const QList<WordWithCharacters> wordsWithCharacters = makeWordFromCharacters(characters, pageWidth, pageHeight);
1674
1675 /**
1676 * Make a XY Cut tree for segmentation of the texts
1677 */
1678 const RegionTextList tree = XYCutForBoundingBoxes(wordsWithCharacters, pageWidth, pageHeight);
1679
1680 /**
1681 * Add spaces to the word
1682 */
1683 const WordsWithCharacters listWithWordsAndSpaces = addNecessarySpace(tree, pageWidth, pageHeight);
1684
1685 /**
1686 * Break the words into characters
1687 */
1688 TextList listOfCharacters;
1689 for (const WordWithCharacters &word : listWithWordsAndSpaces) {
1690 delete word.word;
1691 listOfCharacters.append(word.characters);
1692 }
1693 setWordList(listOfCharacters);
1694 }
1695
words(const RegularAreaRect * area,TextAreaInclusionBehaviour b) const1696 TextEntity::List TextPage::words(const RegularAreaRect *area, TextAreaInclusionBehaviour b) const
1697 {
1698 if (area && area->isNull())
1699 return TextEntity::List();
1700
1701 TextEntity::List ret;
1702 if (area) {
1703 for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1704 if (b == AnyPixelTextAreaInclusionBehaviour) {
1705 if (area->intersects(te->area)) {
1706 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1707 }
1708 } else {
1709 const NormalizedPoint center = te->area.center();
1710 if (area->contains(center.x, center.y)) {
1711 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1712 }
1713 }
1714 }
1715 } else {
1716 for (const TinyTextEntity *te : qAsConst(d->m_words)) {
1717 ret.append(new TextEntity(te->text(), new Okular::NormalizedRect(te->area)));
1718 }
1719 }
1720 return ret;
1721 }
1722
wordAt(const NormalizedPoint & p,QString * word) const1723 RegularAreaRect *TextPage::wordAt(const NormalizedPoint &p, QString *word) const
1724 {
1725 TextList::ConstIterator itBegin = d->m_words.constBegin(), itEnd = d->m_words.constEnd();
1726 TextList::ConstIterator it = itBegin;
1727 TextList::ConstIterator posIt = itEnd;
1728 for (; it != itEnd; ++it) {
1729 if ((*it)->area.contains(p.x, p.y)) {
1730 posIt = it;
1731 break;
1732 }
1733 }
1734 QString text;
1735 if (posIt != itEnd) {
1736 if ((*posIt)->text().simplified().isEmpty()) {
1737 return nullptr;
1738 }
1739 // Find the first TinyTextEntity of the word
1740 while (posIt != itBegin) {
1741 --posIt;
1742 const QString itText = (*posIt)->text();
1743 if (itText.right(1).at(0).isSpace()) {
1744 if (itText.endsWith(QLatin1String("-\n"))) {
1745 // Is an hyphenated word
1746 // continue searching the start of the word back
1747 continue;
1748 }
1749
1750 if (itText == QLatin1String("\n") && posIt != itBegin) {
1751 --posIt;
1752 if ((*posIt)->text().endsWith(QLatin1String("-"))) {
1753 // Is an hyphenated word
1754 // continue searching the start of the word back
1755 continue;
1756 }
1757 ++posIt;
1758 }
1759
1760 ++posIt;
1761 break;
1762 }
1763 }
1764 RegularAreaRect *ret = new RegularAreaRect();
1765 for (; posIt != itEnd; ++posIt) {
1766 const QString itText = (*posIt)->text();
1767 if (itText.simplified().isEmpty()) {
1768 break;
1769 }
1770
1771 ret->appendShape((*posIt)->area);
1772 text += (*posIt)->text();
1773 if (itText.right(1).at(0).isSpace()) {
1774 if (!text.endsWith(QLatin1String("-\n"))) {
1775 break;
1776 }
1777 }
1778 }
1779
1780 if (word) {
1781 *word = text;
1782 }
1783 return ret;
1784 } else {
1785 return nullptr;
1786 }
1787 }
1788