1 /*
2     Copyright © 2008-13 Qtrac Ltd. All rights reserved.
3     This program or module is free software: you can redistribute it
4     and/or modify it under the terms of the GNU General Public License
5     as published by the Free Software Foundation, either version 2 of
6     the License, or (at your option) any later version. This program is
7     distributed in the hope that it will be useful, but WITHOUT ANY
8     WARRANTY; without even the implied warranty of MERCHANTABILITY or
9     FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
10     for more details.
11 */
12 
13 #include "textitem.hpp"
14 
15 #include <QDir>
16 #include <QFile>
17 #include <QTextStream>
18 
19 
texts() const20 QStringList TextItems::texts() const
21 {
22     QStringList list;
23     foreach (const TextItem &item, items)
24         list << item.text;
25     return list;
26 }
27 
28 
rects() const29 QList<QRectF> TextItems::rects() const
30 {
31     QList<QRectF> list;
32     foreach (const TextItem &item, items)
33         list << item.rect;
34     return list;
35 }
36 
// Three-part integer sort key. Keys compare lexicographically: first by
// a, then by b, and finally by c.
struct Key
{
    Key(const int first, const int second, const int third)
        : a(first), b(second), c(third) {}

    // Strict "less than": an earlier field only defers to a later one
    // when it is equal in both keys.
    bool operator<(const Key &other) const {
        return (a < other.a)
            || (a == other.a
                && ((b < other.b) || (b == other.b && c < other.c)));
    }

    int a, b, c;
};
49 
50 
columnZoneYxOrder(const int Width,const int ToleranceR,const int ToleranceY,const int Columns)51 void TextItems::columnZoneYxOrder(const int Width, const int ToleranceR,
52         const int ToleranceY, const int Columns)
53 {
54     // Phase #1: Sort all the texts into (column, y, x) order
55     columnYxOrder(Width, ToleranceY, Columns);
56     // Phase #2: Sort all the texts into (column, zone, y, x) order
57     QList<QPainterPath> zones = generateZones(Width, ToleranceR,
58                                               ToleranceY, Columns);
59     QMap<Key, TextItem> itemForZoneYx;
60     foreach (const TextItem &item, items) {
61         const QRectF rect = item.rect.adjusted(-ToleranceR, -ToleranceR,
62                                                ToleranceR, ToleranceR);
63         const int y = normalizedY(static_cast<int>(item.rect.y()),
64                                   ToleranceY);
65         for (int i = 0; i < zones.count(); ++i) {
66             if (zones.at(i).intersects(rect)) {
67                 itemForZoneYx.insert(Key(i, y, item.rect.x()), item);
68                 break;
69             }
70         }
71     }
72     items = itemForZoneYx.values();
73 }
74 
75 
columnYxOrder(const int Width,const int ToleranceY,const int Columns)76 void TextItems::columnYxOrder(const int Width, const int ToleranceY,
77                               const int Columns)
78 {
79     // Phase #1: Sort all the texts into (column, y, x) order
80     const int Span = Width / Columns;
81     QMap<Key, TextItem> itemForColumnYx;
82     foreach (const TextItem &item, items) {
83         const QRect &rect = item.toRect();
84         const int Column = ((Columns == 1) ? 0
85             : (rect.width() > Span) ? Columns : rect.right() / Span);
86         const int y = normalizedY(static_cast<int>(rect.y()), ToleranceY);
87         itemForColumnYx.insert(Key(Column, y, rect.x()), item);
88     }
89     items = itemForColumnYx.values();
90 }
91 
92 
generateZones(const int Width,const int ToleranceR,const int ToleranceY,const int Columns) const93 const QList<QPainterPath> TextItems::generateZones(const int Width,
94         const int ToleranceR, const int ToleranceY,
95         const int Columns) const
96 { // Assumes that items are already in column, y, x order!
97     // Phase #1: Generate the zones
98     QList<QPainterPath> zones;
99     foreach (const TextItem &item, items) {
100         if (zones.isEmpty()) { // First word becomes first zone
101             QPainterPath zone;
102             zone.addRect(item.rect);
103             zones << zone;
104         } else { // Add to an existing zone within tolerance or a new one
105             const QRectF tolerantRect = item.rect.adjusted(-ToleranceR,
106                     -ToleranceR, ToleranceR, ToleranceR);
107             bool found = false;
108             for (int i = 0; i < zones.count(); ++i) {
109                 QPainterPath zone = zones.at(i);
110                 if (zone.intersects(tolerantRect)) {
111                     zone.addRect(item.rect);
112                     zones[i] = zone;
113                     found = true;
114                     break;
115                 }
116             }
117             if (!found) {
118                 QPainterPath zone;
119                 zone.addRect(item.rect);
120                 zones << zone;
121             }
122         }
123     }
124 
125     // Phase #2: Order the zones by (column, y, x)
126     const int Span = Width / Columns;
127     QMultiMap<Key, QPainterPath> zonesForColumn;
128     foreach (const QPainterPath &zone, zones) {
129         const QRect &rect = zone.boundingRect().toRect();
130         const int Column = ((Columns == 1) ? 0
131             : (rect.width() > Span) ? Columns : rect.right() / Span);
132         const int y = normalizedY(static_cast<int>(rect.y()), ToleranceY);
133         zonesForColumn.insertMulti(Key(Column, y, rect.x()), zone);
134     }
135     return zonesForColumn.values();
136 }
137 
138 
debug(const int page,const int ToleranceY,const bool ComparingWords,const bool Yx)139 void TextItems::debug(const int page, const int ToleranceY,
140         const bool ComparingWords, const bool Yx)
141 {
142     QString filename = QDir::tempPath() + QString("/page1-%1.txt")
143         .arg(page);
144     int count = 1;
145     while (QFile::exists(filename)) {
146         filename = QDir::tempPath() + QString("/page%2-%1.txt").arg(page)
147                                               .arg(++count);
148     }
149     QFile file(filename);
150     if (!file.open(QIODevice::WriteOnly|QIODevice::Text)) {
151         qDebug("%s", qPrintable(QString("cannot write debug file: %1")
152                                 .arg(file.errorString())));
153     }
154     QTextStream out(&file);
155     out.setCodec("UTF-8");
156     out << "Page #" << page << ": "
157         << (ComparingWords ? "Words" : "Characters") << " mode\n";
158     for (int i = 0; i < items.count(); ++i) {
159         const TextItem &item = items.at(i);
160         const QRect rect = item.toRect();
161         out << item.text;
162         if (!ComparingWords)
163             out << QString(" %1").arg(item.text.at(0).unicode(), 4, 16,
164                                       QChar('0'));
165         if (Yx) {
166             const int y = normalizedY(static_cast<int>(item.rect.y()),
167                                       ToleranceY);
168             out << QString(" (%1, %2)").arg(y).arg(rect.x());
169         }
170         out << "\n";
171     }
172     qDebug("%s", qPrintable(QString("wrote '%1'").arg(filename)));
173 }
174 
175 
// Rounds y up to the nearest multiple of ToleranceY so that coordinates
// lying within the same tolerance band compare equal.
//
// A y that is already a multiple is returned unchanged. A ToleranceY of
// zero or less disables rounding and returns y as is. Negative y values
// are rounded up (towards zero) as well, which keeps the mapping
// monotonic: a larger y never yields a smaller result.
inline int normalizedY(const int y, const int ToleranceY)
{
    if (ToleranceY <= 0)
        return y;
    const int remainder = y % ToleranceY;
    if (remainder == 0)
        return y;
    // In C++ the remainder takes the sign of y, so for a negative y
    // subtracting it already moves up to the next multiple.
    return (remainder > 0) ? y + ToleranceY - remainder : y - remainder;
}
183 
184 
getWords(const TextBoxList & list)185 const TextItems getWords(const TextBoxList &list)
186 {
187     TextItems items;
188     foreach (const PdfTextBox &box, list) {
189         QString word = box->text().trimmed();
190         for (int i = 0; i < word.length(); ++i)
191             word[i] = canonicalizedCharacter(word[i]);
192         // DON'T DO: if (!word.isEmpty()) words << word;
193         // since it can mess up highlighting.
194         items.append(TextItem(word, box->boundingBox()));
195     }
196     return items;
197 }
198 
199 
getCharacters(const TextBoxList & list)200 const TextItems getCharacters(const TextBoxList &list)
201 {
202     TextItems items;
203     foreach (const PdfTextBox &box, list) {
204         const QString word = box->text();
205         int limit = word.count() - 1;
206         for (int i = limit; i >= 0; --i)
207             if (!word[i].isSpace())
208                 break;
209         for (int i = 0; i <= limit; ++i) {
210             items.append(TextItem(QString(canonicalizedCharacter(word[i])),
211                                   box->charBoundingBox(i)));
212         }
213     }
214     return items;
215 }
216