1 /**********************************************************************
2 * File: edgblob.cpp (Formerly edgeloop.c)
3 * Description: Functions to clean up an outline before approximation.
4 * Author: Ray Smith
5 *
6 *(C) Copyright 1991, Hewlett-Packard Ltd.
7 ** Licensed under the Apache License, Version 2.0(the "License");
8 ** you may not use this file except in compliance with the License.
9 ** You may obtain a copy of the License at
10 ** http://www.apache.org/licenses/LICENSE-2.0
11 ** Unless required by applicable law or agreed to in writing, software
12 ** distributed under the License is distributed on an "AS IS" BASIS,
13 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 ** See the License for the specific language governing permissions and
15 ** limitations under the License.
16 *
17 **********************************************************************/
18
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 # include "config_auto.h"
22 #endif
23
24 #include "edgblob.h"
25
26 #include "edgloop.h"
27 #include "scanedg.h"
28
29 #define BUCKETSIZE 16
30
31 namespace tesseract {
32
33 // Control parameters used in outline_complexity(), which rejects an outline
34 // if any one of the 3 conditions is satisfied:
35 // - number of children exceeds edges_max_children_per_outline
36 // - number of nested layers exceeds edges_max_children_layers
37 // - joint complexity exceeds edges_children_count_limit(as in child_count())
38 static BOOL_VAR(edges_use_new_outline_complexity, false,
39 "Use the new outline complexity module");
40 static INT_VAR(edges_max_children_per_outline, 10,
41 "Max number of children inside a character outline");
42 static INT_VAR(edges_max_children_layers, 5,
43 "Max layers of nested children inside a character outline");
44 static BOOL_VAR(edges_debug, false, "turn on debugging for this module");
45
46 static INT_VAR(edges_children_per_grandchild, 10,
47 "Importance ratio for chucking outlines");
48 static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob");
49 static BOOL_VAR(edges_children_fix, false,
50 "Remove boxy parents of char-like children");
51 static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box");
52 static INT_VAR(edges_patharea_ratio, 40,
53 "Max lensq/area for acceptable child outline");
54 static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline");
55 static double_VAR(edges_boxarea, 0.875,
56 "Min area fraction of grandchild for box");
57
58 /**
59 * @name OL_BUCKETS::OL_BUCKETS
60 *
61 * Construct an array of buckets for associating outlines into blobs.
62 */
63
OL_BUCKETS(ICOORD bleft,ICOORD tright)64 OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners
65 ICOORD tright)
66 : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),
67 bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),
68 buckets(bxdim * bydim),
69 bl(bleft),
70 tr(tright) {}
71
72 /**
73 * @name OL_BUCKETS::operator(
74 *
75 * Return a pointer to a list of C_OUTLINEs corresponding to the
76 * given pixel coordinates.
77 */
78
operator ()(TDimension x,TDimension y)79 C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access
80 TDimension x, // image coords
81 TDimension y) {
82 return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +
83 (x - bl.x()) / BUCKETSIZE];
84 }
85
start_scan()86 C_OUTLINE_LIST *OL_BUCKETS::start_scan() {
87 return scan_next(buckets.begin());
88 }
89
scan_next()90 C_OUTLINE_LIST *OL_BUCKETS::scan_next() {
91 return scan_next(it);
92 }
93
scan_next(decltype(buckets)::iterator in_it)94 C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) {
95 it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); });
96 if (it == buckets.end())
97 return nullptr;
98 return &*it;
99 }
100
101 /**
102 * @name OL_BUCKETS::outline_complexity
103 *
104 * This is the new version of count_child.
105 *
106 * The goal of this function is to determine if an outline and its
107 * interiors could be part of a character blob. This is done by
108 * computing a "complexity" index for the outline, which is the return
109 * value of this function, and checking it against a threshold.
110 * The max_count is used for short-circuiting the recursion and forcing
111 * a rejection that guarantees to fail the threshold test.
112 * The complexity F for outline X with N children X[i] is
113 * F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild
114 * so each layer of nesting increases complexity exponentially.
115 * An outline can be rejected as a text blob candidate if its complexity
116 * is too high, has too many children(likely a container), or has too
117 * many layers of nested inner loops. This has the side-effect of
118 * flattening out boxed or reversed video text regions.
119 */
120
outline_complexity(C_OUTLINE * outline,int32_t max_count,int16_t depth)121 int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline
122 int32_t max_count, // max output
123 int16_t depth // recurion depth
124 ) {
125 TDimension xmin, xmax; // coord limits
126 TDimension ymin, ymax;
127 C_OUTLINE *child; // current child
128 int32_t child_count; // no of children
129 int32_t grandchild_count; // no of grandchildren
130 C_OUTLINE_IT child_it; // search iterator
131
132 TBOX olbox = outline->bounding_box();
133 xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
134 xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
135 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
136 ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
137 child_count = 0;
138 grandchild_count = 0;
139 if (++depth > edges_max_children_layers) { // nested loops are too deep
140 return max_count + depth;
141 }
142
143 for (auto yindex = ymin; yindex <= ymax; yindex++) {
144 for (auto xindex = xmin; xindex <= xmax; xindex++) {
145 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
146 if (child_it.empty()) {
147 continue;
148 }
149 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
150 child_it.forward()) {
151 child = child_it.data();
152 if (child == outline || !(*child < *outline)) {
153 continue;
154 }
155 child_count++;
156
157 if (child_count > edges_max_children_per_outline) { // too fragmented
158 if (edges_debug) {
159 tprintf(
160 "Discard outline on child_count=%d > "
161 "max_children_per_outline=%d\n",
162 child_count,
163 static_cast<int32_t>(edges_max_children_per_outline));
164 }
165 return max_count + child_count;
166 }
167
168 // Compute the "complexity" of each child recursively
169 int32_t remaining_count = max_count - child_count - grandchild_count;
170 if (remaining_count > 0) {
171 grandchild_count += edges_children_per_grandchild *
172 outline_complexity(child, remaining_count, depth);
173 }
174 if (child_count + grandchild_count > max_count) { // too complex
175 if (edges_debug) {
176 tprintf(
177 "Disgard outline on child_count=%d + grandchild_count=%d "
178 "> max_count=%d\n",
179 child_count, grandchild_count, max_count);
180 }
181 return child_count + grandchild_count;
182 }
183 }
184 }
185 }
186 return child_count + grandchild_count;
187 }
188
189 /**
190 * @name OL_BUCKETS::count_children
191 *
192 * Find number of descendants of this outline.
193 */
194 // TODO(rays) Merge with outline_complexity.
count_children(C_OUTLINE * outline,int32_t max_count)195 int32_t OL_BUCKETS::count_children( // recursive count
196 C_OUTLINE *outline, // parent outline
197 int32_t max_count // max output
198 ) {
199 bool parent_box; // could it be boxy
200 TDimension xmin, xmax; // coord limits
201 TDimension ymin, ymax;
202 C_OUTLINE *child; // current child
203 int32_t child_count; // no of children
204 int32_t grandchild_count; // no of grandchildren
205 int32_t parent_area; // potential box
206 float max_parent_area; // potential box
207 int32_t child_area; // current child
208 int32_t child_length; // current child
209 TBOX olbox;
210 C_OUTLINE_IT child_it; // search iterator
211
212 olbox = outline->bounding_box();
213 xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
214 xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
215 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
216 ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
217 child_count = 0;
218 grandchild_count = 0;
219 parent_area = 0;
220 max_parent_area = 0;
221 parent_box = true;
222 for (auto yindex = ymin; yindex <= ymax; yindex++) {
223 for (auto xindex = xmin; xindex <= xmax; xindex++) {
224 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
225 if (child_it.empty()) {
226 continue;
227 }
228 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
229 child_it.forward()) {
230 child = child_it.data();
231 if (child != outline && *child < *outline) {
232 child_count++;
233 if (child_count <= max_count) {
234 int max_grand =
235 (max_count - child_count) / edges_children_per_grandchild;
236 if (max_grand > 0) {
237 grandchild_count += count_children(child, max_grand) *
238 edges_children_per_grandchild;
239 } else {
240 grandchild_count += count_children(child, 1);
241 }
242 }
243 if (child_count + grandchild_count > max_count) {
244 if (edges_debug) {
245 tprintf("Discarding parent with child count=%d, gc=%d\n",
246 child_count, grandchild_count);
247 }
248 return child_count + grandchild_count;
249 }
250 if (parent_area == 0) {
251 parent_area = outline->outer_area();
252 if (parent_area < 0) {
253 parent_area = -parent_area;
254 }
255 max_parent_area = outline->bounding_box().area() * edges_boxarea;
256 if (parent_area < max_parent_area) {
257 parent_box = false;
258 }
259 }
260 if (parent_box &&
261 (!edges_children_fix ||
262 child->bounding_box().height() > edges_min_nonhole)) {
263 child_area = child->outer_area();
264 if (child_area < 0) {
265 child_area = -child_area;
266 }
267 if (edges_children_fix) {
268 if (parent_area - child_area < max_parent_area) {
269 parent_box = false;
270 continue;
271 }
272 if (grandchild_count > 0) {
273 if (edges_debug) {
274 tprintf(
275 "Discarding parent of area %d, child area=%d, max%g "
276 "with gc=%d\n",
277 parent_area, child_area, max_parent_area,
278 grandchild_count);
279 }
280 return max_count + 1;
281 }
282 child_length = child->pathlength();
283 if (child_length * child_length >
284 child_area * edges_patharea_ratio) {
285 if (edges_debug) {
286 tprintf(
287 "Discarding parent of area %d, child area=%d, max%g "
288 "with child length=%d\n",
289 parent_area, child_area, max_parent_area, child_length);
290 }
291 return max_count + 1;
292 }
293 }
294 if (child_area < child->bounding_box().area() * edges_childarea) {
295 if (edges_debug) {
296 tprintf(
297 "Discarding parent of area %d, child area=%d, max%g "
298 "with child rect=%d\n",
299 parent_area, child_area, max_parent_area,
300 child->bounding_box().area());
301 }
302 return max_count + 1;
303 }
304 }
305 }
306 }
307 }
308 }
309 return child_count + grandchild_count;
310 }
311
312 /**
313 * @name OL_BUCKETS::extract_children
314 *
315 * Find number of descendants of this outline.
316 */
317
extract_children(C_OUTLINE * outline,C_OUTLINE_IT * it)318 void OL_BUCKETS::extract_children( // recursive count
319 C_OUTLINE *outline, // parent outline
320 C_OUTLINE_IT *it // destination iterator
321 ) {
322 TDimension xmin, xmax; // coord limits
323 TDimension ymin, ymax;
324 TBOX olbox;
325 C_OUTLINE_IT child_it; // search iterator
326
327 olbox = outline->bounding_box();
328 xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
329 xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
330 ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
331 ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
332 for (auto yindex = ymin; yindex <= ymax; yindex++) {
333 for (auto xindex = xmin; xindex <= xmax; xindex++) {
334 child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
335 for (child_it.mark_cycle_pt(); !child_it.cycled_list();
336 child_it.forward()) {
337 if (*child_it.data() < *outline) {
338 it->add_after_then_move(child_it.extract());
339 }
340 }
341 }
342 }
343 }
344
345 /// @name extract_edges
346
extract_edges(Image pix,BLOCK * block)347 void extract_edges(Image pix, // thresholded image
348 BLOCK *block) { // block to scan
349 C_OUTLINE_LIST outlines; // outlines in block
350 C_OUTLINE_IT out_it = &outlines;
351
352 block_edges(pix, &(block->pdblk), &out_it);
353 ICOORD bleft; // block box
354 ICOORD tright;
355 block->pdblk.bounding_box(bleft, tright);
356 // make blobs
357 outlines_to_blobs(block, bleft, tright, &outlines);
358 }
359
360 /// @name fill_buckets
361
fill_buckets(C_OUTLINE_LIST * outlines,OL_BUCKETS * buckets)362 static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block
363 OL_BUCKETS *buckets // output buckets
364 ) {
365 C_OUTLINE_IT out_it = outlines; // iterator
366 C_OUTLINE_IT bucket_it; // iterator in bucket
367
368 for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
369 auto outline = out_it.extract(); // take off list
370 // get box
371 const TBOX &ol_box(outline->bounding_box());
372 bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));
373 bucket_it.add_to_end(outline);
374 }
375 }
376
377 /**
378 * @name capture_children
379 *
380 * Find all neighbouring outlines that are children of this outline
381 * and either move them to the output list or declare this outline
382 * illegal and return false.
383 */
384
capture_children(OL_BUCKETS * buckets,C_BLOB_IT * reject_it,C_OUTLINE_IT * blob_it)385 static bool capture_children(OL_BUCKETS *buckets, // bucket sort class
386 C_BLOB_IT *reject_it, // dead grandchildren
387 C_OUTLINE_IT *blob_it // output outlines
388 ) {
389 // master outline
390 auto outline = blob_it->data();
391 // no of children
392 int32_t child_count;
393 if (edges_use_new_outline_complexity) {
394 child_count =
395 buckets->outline_complexity(outline, edges_children_count_limit, 0);
396 } else {
397 child_count = buckets->count_children(outline, edges_children_count_limit);
398 }
399 if (child_count > edges_children_count_limit) {
400 return false;
401 }
402
403 if (child_count > 0) {
404 buckets->extract_children(outline, blob_it);
405 }
406 return true;
407 }
408
409 /**
410 * @name empty_buckets
411 *
412 * Run the edge detector over the block and return a list of blobs.
413 */
414
empty_buckets(BLOCK * block,OL_BUCKETS * buckets)415 static void empty_buckets(BLOCK *block, // block to scan
416 OL_BUCKETS *buckets // output buckets
417 ) {
418 C_OUTLINE_LIST outlines; // outlines in block
419 // iterator
420 C_OUTLINE_IT out_it = &outlines;
421 auto start_scan = buckets->start_scan();
422 if (start_scan == nullptr) {
423 return;
424 }
425 C_OUTLINE_IT bucket_it = start_scan;
426 C_BLOB_IT good_blobs = block->blob_list();
427 C_BLOB_IT junk_blobs = block->reject_blobs();
428
429 while (!bucket_it.empty()) {
430 out_it.set_to_list(&outlines);
431 C_OUTLINE_IT parent_it; // parent outline
432 do {
433 parent_it = bucket_it; // find outermost
434 do {
435 bucket_it.forward();
436 } while (!bucket_it.at_first() &&
437 !(*parent_it.data() < *bucket_it.data()));
438 } while (!bucket_it.at_first());
439
440 // move to new list
441 out_it.add_after_then_move(parent_it.extract());
442 // healthy blob
443 bool good_blob = capture_children(buckets, &junk_blobs, &out_it);
444 C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
445 &junk_blobs);
446
447 if (auto l = buckets->scan_next())
448 bucket_it.set_to_list(l);
449 else
450 break;
451 }
452 }
453
454 /**
455 * @name outlines_to_blobs
456 *
457 * Gather together outlines into blobs using the usual bucket sort.
458 */
459
outlines_to_blobs(BLOCK * block,ICOORD bleft,ICOORD tright,C_OUTLINE_LIST * outlines)460 void outlines_to_blobs( // find blobs
461 BLOCK *block, // block to scan
462 ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) {
463 // make buckets
464 OL_BUCKETS buckets(bleft, tright);
465
466 fill_buckets(outlines, &buckets);
467 empty_buckets(block, &buckets);
468 }
469
470 } // namespace tesseract
471