/*
    Copyright © 2008-13 Qtrac Ltd. All rights reserved.
    This program or module is free software: you can redistribute it
    and/or modify it under the terms of the GNU General Public License
    as published by the Free Software Foundation, either version 2 of
    the License, or (at your option) any later version. This program is
    distributed in the hope that it will be useful, but WITHOUT ANY
    WARRANTY; without even the implied warranty of MERCHANTABILITY or
    FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
    for more details.
*/
12
13 #include "textitem.hpp"
14
15 #include <QDir>
16 #include <QFile>
17 #include <QTextStream>
18
19
texts() const20 QStringList TextItems::texts() const
21 {
22 QStringList list;
23 foreach (const TextItem &item, items)
24 list << item.text;
25 return list;
26 }
27
28
rects() const29 QList<QRectF> TextItems::rects() const
30 {
31 QList<QRectF> list;
32 foreach (const TextItem &item, items)
33 list << item.rect;
34 return list;
35 }
36
// Three-part integer sort key used for the ordered maps in this file.
// Keys compare lexicographically: first by a, then by b, then by c.
struct Key
{
    Key(const int first, const int second, const int third)
        : a(first), b(second), c(third) {}

    // Strict weak ordering: the first differing component decides.
    bool operator<(const Key &other) const {
        if (a < other.a) return true;
        if (other.a < a) return false;
        if (b < other.b) return true;
        if (other.b < b) return false;
        return c < other.c;
    }

    int a, b, c;
};
49
50
columnZoneYxOrder(const int Width,const int ToleranceR,const int ToleranceY,const int Columns)51 void TextItems::columnZoneYxOrder(const int Width, const int ToleranceR,
52 const int ToleranceY, const int Columns)
53 {
54 // Phase #1: Sort all the texts into (column, y, x) order
55 columnYxOrder(Width, ToleranceY, Columns);
56 // Phase #2: Sort all the texts into (column, zone, y, x) order
57 QList<QPainterPath> zones = generateZones(Width, ToleranceR,
58 ToleranceY, Columns);
59 QMap<Key, TextItem> itemForZoneYx;
60 foreach (const TextItem &item, items) {
61 const QRectF rect = item.rect.adjusted(-ToleranceR, -ToleranceR,
62 ToleranceR, ToleranceR);
63 const int y = normalizedY(static_cast<int>(item.rect.y()),
64 ToleranceY);
65 for (int i = 0; i < zones.count(); ++i) {
66 if (zones.at(i).intersects(rect)) {
67 itemForZoneYx.insert(Key(i, y, item.rect.x()), item);
68 break;
69 }
70 }
71 }
72 items = itemForZoneYx.values();
73 }
74
75
columnYxOrder(const int Width,const int ToleranceY,const int Columns)76 void TextItems::columnYxOrder(const int Width, const int ToleranceY,
77 const int Columns)
78 {
79 // Phase #1: Sort all the texts into (column, y, x) order
80 const int Span = Width / Columns;
81 QMap<Key, TextItem> itemForColumnYx;
82 foreach (const TextItem &item, items) {
83 const QRect &rect = item.toRect();
84 const int Column = ((Columns == 1) ? 0
85 : (rect.width() > Span) ? Columns : rect.right() / Span);
86 const int y = normalizedY(static_cast<int>(rect.y()), ToleranceY);
87 itemForColumnYx.insert(Key(Column, y, rect.x()), item);
88 }
89 items = itemForColumnYx.values();
90 }
91
92
generateZones(const int Width,const int ToleranceR,const int ToleranceY,const int Columns) const93 const QList<QPainterPath> TextItems::generateZones(const int Width,
94 const int ToleranceR, const int ToleranceY,
95 const int Columns) const
96 { // Assumes that items are already in column, y, x order!
97 // Phase #1: Generate the zones
98 QList<QPainterPath> zones;
99 foreach (const TextItem &item, items) {
100 if (zones.isEmpty()) { // First word becomes first zone
101 QPainterPath zone;
102 zone.addRect(item.rect);
103 zones << zone;
104 } else { // Add to an existing zone within tolerance or a new one
105 const QRectF tolerantRect = item.rect.adjusted(-ToleranceR,
106 -ToleranceR, ToleranceR, ToleranceR);
107 bool found = false;
108 for (int i = 0; i < zones.count(); ++i) {
109 QPainterPath zone = zones.at(i);
110 if (zone.intersects(tolerantRect)) {
111 zone.addRect(item.rect);
112 zones[i] = zone;
113 found = true;
114 break;
115 }
116 }
117 if (!found) {
118 QPainterPath zone;
119 zone.addRect(item.rect);
120 zones << zone;
121 }
122 }
123 }
124
125 // Phase #2: Order the zones by (column, y, x)
126 const int Span = Width / Columns;
127 QMultiMap<Key, QPainterPath> zonesForColumn;
128 foreach (const QPainterPath &zone, zones) {
129 const QRect &rect = zone.boundingRect().toRect();
130 const int Column = ((Columns == 1) ? 0
131 : (rect.width() > Span) ? Columns : rect.right() / Span);
132 const int y = normalizedY(static_cast<int>(rect.y()), ToleranceY);
133 zonesForColumn.insertMulti(Key(Column, y, rect.x()), zone);
134 }
135 return zonesForColumn.values();
136 }
137
138
debug(const int page,const int ToleranceY,const bool ComparingWords,const bool Yx)139 void TextItems::debug(const int page, const int ToleranceY,
140 const bool ComparingWords, const bool Yx)
141 {
142 QString filename = QDir::tempPath() + QString("/page1-%1.txt")
143 .arg(page);
144 int count = 1;
145 while (QFile::exists(filename)) {
146 filename = QDir::tempPath() + QString("/page%2-%1.txt").arg(page)
147 .arg(++count);
148 }
149 QFile file(filename);
150 if (!file.open(QIODevice::WriteOnly|QIODevice::Text)) {
151 qDebug("%s", qPrintable(QString("cannot write debug file: %1")
152 .arg(file.errorString())));
153 }
154 QTextStream out(&file);
155 out.setCodec("UTF-8");
156 out << "Page #" << page << ": "
157 << (ComparingWords ? "Words" : "Characters") << " mode\n";
158 for (int i = 0; i < items.count(); ++i) {
159 const TextItem &item = items.at(i);
160 const QRect rect = item.toRect();
161 out << item.text;
162 if (!ComparingWords)
163 out << QString(" %1").arg(item.text.at(0).unicode(), 4, 16,
164 QChar('0'));
165 if (Yx) {
166 const int y = normalizedY(static_cast<int>(item.rect.y()),
167 ToleranceY);
168 out << QString(" (%1, %2)").arg(y).arg(rect.x());
169 }
170 out << "\n";
171 }
172 qDebug("%s", qPrintable(QString("wrote '%1'").arg(filename)));
173 }
174
175
// Rounds y up to the nearest multiple of ToleranceY so that coordinates
// falling in the same band get the same value.
//
// y          - the coordinate to normalize (may be negative).
// ToleranceY - the band size; 0 disables normalization and y is returned
//              unchanged. Assumed to be non-negative.
//
// Returns y itself when it is already a multiple of ToleranceY, otherwise
// the next multiple above it (e.g. 3 -> 10 and -3 -> 0 for a tolerance
// of 10).
inline int normalizedY(const int y, const int ToleranceY)
{
    if (ToleranceY == 0)
        return y; // also avoids a modulo by zero
    const int remainder = y % ToleranceY;
    if (remainder == 0)
        return y;
    // In C++ the result of % takes the sign of the dividend, so for a
    // negative y the remainder is negative and simply removing it already
    // rounds up; adding ToleranceY as well would overshoot by one band.
    return (remainder > 0) ? y + ToleranceY - remainder : y - remainder;
}
183
184
getWords(const TextBoxList & list)185 const TextItems getWords(const TextBoxList &list)
186 {
187 TextItems items;
188 foreach (const PdfTextBox &box, list) {
189 QString word = box->text().trimmed();
190 for (int i = 0; i < word.length(); ++i)
191 word[i] = canonicalizedCharacter(word[i]);
192 // DON'T DO: if (!word.isEmpty()) words << word;
193 // since it can mess up highlighting.
194 items.append(TextItem(word, box->boundingBox()));
195 }
196 return items;
197 }
198
199
getCharacters(const TextBoxList & list)200 const TextItems getCharacters(const TextBoxList &list)
201 {
202 TextItems items;
203 foreach (const PdfTextBox &box, list) {
204 const QString word = box->text();
205 int limit = word.count() - 1;
206 for (int i = limit; i >= 0; --i)
207 if (!word[i].isSpace())
208 break;
209 for (int i = 0; i <= limit; ++i) {
210 items.append(TextItem(QString(canonicalizedCharacter(word[i])),
211 box->charBoundingBox(i)));
212 }
213 }
214 return items;
215 }
216