1 /**********************************************************************
2  * File:        edgblob.cpp (Formerly edgeloop.c)
3  * Description: Functions to clean up an outline before approximation.
4  * Author:      Ray Smith
5  *
6  *(C) Copyright 1991, Hewlett-Packard Ltd.
7  ** Licensed under the Apache License, Version 2.0(the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 // Include automatically generated configuration file if running autoconf.
20 #ifdef HAVE_CONFIG_H
21 #  include "config_auto.h"
22 #endif
23 
24 #include "edgblob.h"
25 
26 #include "edgloop.h"
27 #include "scanedg.h"
28 
29 #define BUCKETSIZE 16
30 
31 namespace tesseract {
32 
33 // Control parameters used in outline_complexity(), which rejects an outline
34 // if any one of the 3 conditions is satisfied:
35 //  - number of children exceeds edges_max_children_per_outline
36 //  - number of nested layers exceeds edges_max_children_layers
37 //  - joint complexity exceeds edges_children_count_limit(as in child_count())
38 static BOOL_VAR(edges_use_new_outline_complexity, false,
39                 "Use the new outline complexity module");
40 static INT_VAR(edges_max_children_per_outline, 10,
41                "Max number of children inside a character outline");
42 static INT_VAR(edges_max_children_layers, 5,
43                "Max layers of nested children inside a character outline");
44 static BOOL_VAR(edges_debug, false, "turn on debugging for this module");
45 
46 static INT_VAR(edges_children_per_grandchild, 10,
47                "Importance ratio for chucking outlines");
48 static INT_VAR(edges_children_count_limit, 45, "Max holes allowed in blob");
49 static BOOL_VAR(edges_children_fix, false,
50                 "Remove boxy parents of char-like children");
51 static INT_VAR(edges_min_nonhole, 12, "Min pixels for potential char in box");
52 static INT_VAR(edges_patharea_ratio, 40,
53                "Max lensq/area for acceptable child outline");
54 static double_VAR(edges_childarea, 0.5, "Min area fraction of child outline");
55 static double_VAR(edges_boxarea, 0.875,
56                   "Min area fraction of grandchild for box");
57 
58 /**
59  * @name OL_BUCKETS::OL_BUCKETS
60  *
61  * Construct an array of buckets for associating outlines into blobs.
62  */
63 
OL_BUCKETS(ICOORD bleft,ICOORD tright)64 OL_BUCKETS::OL_BUCKETS(ICOORD bleft, // corners
65                        ICOORD tright)
66     : bxdim((tright.x() - bleft.x()) / BUCKETSIZE + 1),
67       bydim((tright.y() - bleft.y()) / BUCKETSIZE + 1),
68       buckets(bxdim * bydim),
69       bl(bleft),
70       tr(tright) {}
71 
72 /**
73  * @name OL_BUCKETS::operator(
74  *
75  * Return a pointer to a list of C_OUTLINEs corresponding to the
76  * given pixel coordinates.
77  */
78 
operator ()(TDimension x,TDimension y)79 C_OUTLINE_LIST *OL_BUCKETS::operator()( // array access
80     TDimension x,                       // image coords
81     TDimension y) {
82   return &buckets[(y - bl.y()) / BUCKETSIZE * bxdim +
83                   (x - bl.x()) / BUCKETSIZE];
84 }
85 
start_scan()86 C_OUTLINE_LIST *OL_BUCKETS::start_scan() {
87   return scan_next(buckets.begin());
88 }
89 
scan_next()90 C_OUTLINE_LIST *OL_BUCKETS::scan_next() {
91   return scan_next(it);
92 }
93 
scan_next(decltype(buckets)::iterator in_it)94 C_OUTLINE_LIST *OL_BUCKETS::scan_next(decltype(buckets)::iterator in_it) {
95   it = std::find_if(in_it, buckets.end(), [](auto &&b) { return !b.empty(); });
96   if (it == buckets.end())
97     return nullptr;
98   return &*it;
99 }
100 
101 /**
102  * @name OL_BUCKETS::outline_complexity
103  *
104  * This is the new version of count_child.
105  *
106  * The goal of this function is to determine if an outline and its
107  * interiors could be part of a character blob.  This is done by
108  * computing a "complexity" index for the outline, which is the return
109  * value of this function, and checking it against a threshold.
110  * The max_count is used for short-circuiting the recursion and forcing
111  * a rejection that guarantees to fail the threshold test.
112  * The complexity F for outline X with N children X[i] is
113  *   F(X) = N + sum_i F(X[i]) * edges_children_per_grandchild
114  * so each layer of nesting increases complexity exponentially.
115  * An outline can be rejected as a text blob candidate if its complexity
116  * is too high, has too many children(likely a container), or has too
117  * many layers of nested inner loops.  This has the side-effect of
118  * flattening out boxed or reversed video text regions.
119  */
120 
outline_complexity(C_OUTLINE * outline,int32_t max_count,int16_t depth)121 int32_t OL_BUCKETS::outline_complexity(C_OUTLINE *outline, // parent outline
122                                        int32_t max_count,  // max output
123                                        int16_t depth       // recurion depth
124 ) {
125   TDimension xmin, xmax;    // coord limits
126   TDimension ymin, ymax;
127   C_OUTLINE *child;         // current child
128   int32_t child_count;      // no of children
129   int32_t grandchild_count; // no of grandchildren
130   C_OUTLINE_IT child_it;    // search iterator
131 
132   TBOX olbox = outline->bounding_box();
133   xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
134   xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
135   ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
136   ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
137   child_count = 0;
138   grandchild_count = 0;
139   if (++depth > edges_max_children_layers) { // nested loops are too deep
140     return max_count + depth;
141   }
142 
143   for (auto yindex = ymin; yindex <= ymax; yindex++) {
144     for (auto xindex = xmin; xindex <= xmax; xindex++) {
145       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
146       if (child_it.empty()) {
147         continue;
148       }
149       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
150            child_it.forward()) {
151         child = child_it.data();
152         if (child == outline || !(*child < *outline)) {
153           continue;
154         }
155         child_count++;
156 
157         if (child_count > edges_max_children_per_outline) { // too fragmented
158           if (edges_debug) {
159             tprintf(
160                 "Discard outline on child_count=%d > "
161                 "max_children_per_outline=%d\n",
162                 child_count,
163                 static_cast<int32_t>(edges_max_children_per_outline));
164           }
165           return max_count + child_count;
166         }
167 
168         // Compute the "complexity" of each child recursively
169         int32_t remaining_count = max_count - child_count - grandchild_count;
170         if (remaining_count > 0) {
171           grandchild_count += edges_children_per_grandchild *
172                               outline_complexity(child, remaining_count, depth);
173         }
174         if (child_count + grandchild_count > max_count) { // too complex
175           if (edges_debug) {
176             tprintf(
177                 "Disgard outline on child_count=%d + grandchild_count=%d "
178                 "> max_count=%d\n",
179                 child_count, grandchild_count, max_count);
180           }
181           return child_count + grandchild_count;
182         }
183       }
184     }
185   }
186   return child_count + grandchild_count;
187 }
188 
189 /**
190  * @name OL_BUCKETS::count_children
191  *
192  * Find number of descendants of this outline.
193  */
194 // TODO(rays) Merge with outline_complexity.
count_children(C_OUTLINE * outline,int32_t max_count)195 int32_t OL_BUCKETS::count_children( // recursive count
196     C_OUTLINE *outline,             // parent outline
197     int32_t max_count               // max output
198 ) {
199   bool parent_box;    // could it be boxy
200   TDimension xmin, xmax;    // coord limits
201   TDimension ymin, ymax;
202   C_OUTLINE *child;         // current child
203   int32_t child_count;      // no of children
204   int32_t grandchild_count; // no of grandchildren
205   int32_t parent_area;      // potential box
206   float max_parent_area;    // potential box
207   int32_t child_area;       // current child
208   int32_t child_length;     // current child
209   TBOX olbox;
210   C_OUTLINE_IT child_it; // search iterator
211 
212   olbox = outline->bounding_box();
213   xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
214   xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
215   ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
216   ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
217   child_count = 0;
218   grandchild_count = 0;
219   parent_area = 0;
220   max_parent_area = 0;
221   parent_box = true;
222   for (auto yindex = ymin; yindex <= ymax; yindex++) {
223     for (auto xindex = xmin; xindex <= xmax; xindex++) {
224       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
225       if (child_it.empty()) {
226         continue;
227       }
228       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
229            child_it.forward()) {
230         child = child_it.data();
231         if (child != outline && *child < *outline) {
232           child_count++;
233           if (child_count <= max_count) {
234             int max_grand =
235                 (max_count - child_count) / edges_children_per_grandchild;
236             if (max_grand > 0) {
237               grandchild_count += count_children(child, max_grand) *
238                                   edges_children_per_grandchild;
239             } else {
240               grandchild_count += count_children(child, 1);
241             }
242           }
243           if (child_count + grandchild_count > max_count) {
244             if (edges_debug) {
245               tprintf("Discarding parent with child count=%d, gc=%d\n",
246                       child_count, grandchild_count);
247             }
248             return child_count + grandchild_count;
249           }
250           if (parent_area == 0) {
251             parent_area = outline->outer_area();
252             if (parent_area < 0) {
253               parent_area = -parent_area;
254             }
255             max_parent_area = outline->bounding_box().area() * edges_boxarea;
256             if (parent_area < max_parent_area) {
257               parent_box = false;
258             }
259           }
260           if (parent_box &&
261               (!edges_children_fix ||
262                child->bounding_box().height() > edges_min_nonhole)) {
263             child_area = child->outer_area();
264             if (child_area < 0) {
265               child_area = -child_area;
266             }
267             if (edges_children_fix) {
268               if (parent_area - child_area < max_parent_area) {
269                 parent_box = false;
270                 continue;
271               }
272               if (grandchild_count > 0) {
273                 if (edges_debug) {
274                   tprintf(
275                       "Discarding parent of area %d, child area=%d, max%g "
276                       "with gc=%d\n",
277                       parent_area, child_area, max_parent_area,
278                       grandchild_count);
279                 }
280                 return max_count + 1;
281               }
282               child_length = child->pathlength();
283               if (child_length * child_length >
284                   child_area * edges_patharea_ratio) {
285                 if (edges_debug) {
286                   tprintf(
287                       "Discarding parent of area %d, child area=%d, max%g "
288                       "with child length=%d\n",
289                       parent_area, child_area, max_parent_area, child_length);
290                 }
291                 return max_count + 1;
292               }
293             }
294             if (child_area < child->bounding_box().area() * edges_childarea) {
295               if (edges_debug) {
296                 tprintf(
297                     "Discarding parent of area %d, child area=%d, max%g "
298                     "with child rect=%d\n",
299                     parent_area, child_area, max_parent_area,
300                     child->bounding_box().area());
301               }
302               return max_count + 1;
303             }
304           }
305         }
306       }
307     }
308   }
309   return child_count + grandchild_count;
310 }
311 
312 /**
313  * @name OL_BUCKETS::extract_children
314  *
315  * Find number of descendants of this outline.
316  */
317 
extract_children(C_OUTLINE * outline,C_OUTLINE_IT * it)318 void OL_BUCKETS::extract_children( // recursive count
319     C_OUTLINE *outline,            // parent outline
320     C_OUTLINE_IT *it               // destination iterator
321 ) {
322   TDimension xmin, xmax; // coord limits
323   TDimension ymin, ymax;
324   TBOX olbox;
325   C_OUTLINE_IT child_it; // search iterator
326 
327   olbox = outline->bounding_box();
328   xmin = (olbox.left() - bl.x()) / BUCKETSIZE;
329   xmax = (olbox.right() - bl.x()) / BUCKETSIZE;
330   ymin = (olbox.bottom() - bl.y()) / BUCKETSIZE;
331   ymax = (olbox.top() - bl.y()) / BUCKETSIZE;
332   for (auto yindex = ymin; yindex <= ymax; yindex++) {
333     for (auto xindex = xmin; xindex <= xmax; xindex++) {
334       child_it.set_to_list(&buckets[yindex * bxdim + xindex]);
335       for (child_it.mark_cycle_pt(); !child_it.cycled_list();
336            child_it.forward()) {
337         if (*child_it.data() < *outline) {
338           it->add_after_then_move(child_it.extract());
339         }
340       }
341     }
342   }
343 }
344 
345 /// @name extract_edges
346 
extract_edges(Image pix,BLOCK * block)347 void extract_edges(Image pix,      // thresholded image
348                    BLOCK *block) { // block to scan
349   C_OUTLINE_LIST outlines;         // outlines in block
350   C_OUTLINE_IT out_it = &outlines;
351 
352   block_edges(pix, &(block->pdblk), &out_it);
353   ICOORD bleft; // block box
354   ICOORD tright;
355   block->pdblk.bounding_box(bleft, tright);
356   // make blobs
357   outlines_to_blobs(block, bleft, tright, &outlines);
358 }
359 
360 /// @name fill_buckets
361 
fill_buckets(C_OUTLINE_LIST * outlines,OL_BUCKETS * buckets)362 static void fill_buckets(C_OUTLINE_LIST *outlines, // outlines in block
363                          OL_BUCKETS *buckets       // output buckets
364 ) {
365   C_OUTLINE_IT out_it = outlines; // iterator
366   C_OUTLINE_IT bucket_it;         // iterator in bucket
367 
368   for (out_it.mark_cycle_pt(); !out_it.cycled_list(); out_it.forward()) {
369     auto outline = out_it.extract(); // take off list
370                                      // get box
371     const TBOX &ol_box(outline->bounding_box());
372     bucket_it.set_to_list((*buckets)(ol_box.left(), ol_box.bottom()));
373     bucket_it.add_to_end(outline);
374   }
375 }
376 
377 /**
378  * @name capture_children
379  *
380  * Find all neighbouring outlines that are children of this outline
381  * and either move them to the output list or declare this outline
382  * illegal and return false.
383  */
384 
capture_children(OL_BUCKETS * buckets,C_BLOB_IT * reject_it,C_OUTLINE_IT * blob_it)385 static bool capture_children(OL_BUCKETS *buckets,  // bucket sort class
386                              C_BLOB_IT *reject_it, // dead grandchildren
387                              C_OUTLINE_IT *blob_it // output outlines
388 ) {
389   // master outline
390   auto outline = blob_it->data();
391   // no of children
392   int32_t child_count;
393   if (edges_use_new_outline_complexity) {
394     child_count =
395         buckets->outline_complexity(outline, edges_children_count_limit, 0);
396   } else {
397     child_count = buckets->count_children(outline, edges_children_count_limit);
398   }
399   if (child_count > edges_children_count_limit) {
400     return false;
401   }
402 
403   if (child_count > 0) {
404     buckets->extract_children(outline, blob_it);
405   }
406   return true;
407 }
408 
409 /**
410  * @name empty_buckets
411  *
412  * Run the edge detector over the block and return a list of blobs.
413  */
414 
empty_buckets(BLOCK * block,OL_BUCKETS * buckets)415 static void empty_buckets(BLOCK *block,       // block to scan
416                           OL_BUCKETS *buckets // output buckets
417 ) {
418   C_OUTLINE_LIST outlines; // outlines in block
419                            // iterator
420   C_OUTLINE_IT out_it = &outlines;
421   auto start_scan = buckets->start_scan();
422   if (start_scan == nullptr) {
423     return;
424   }
425   C_OUTLINE_IT bucket_it = start_scan;
426   C_BLOB_IT good_blobs = block->blob_list();
427   C_BLOB_IT junk_blobs = block->reject_blobs();
428 
429   while (!bucket_it.empty()) {
430     out_it.set_to_list(&outlines);
431     C_OUTLINE_IT parent_it; // parent outline
432     do {
433       parent_it = bucket_it; // find outermost
434       do {
435         bucket_it.forward();
436       } while (!bucket_it.at_first() &&
437                !(*parent_it.data() < *bucket_it.data()));
438     } while (!bucket_it.at_first());
439 
440     // move to new list
441     out_it.add_after_then_move(parent_it.extract());
442     // healthy blob
443     bool good_blob = capture_children(buckets, &junk_blobs, &out_it);
444     C_BLOB::ConstructBlobsFromOutlines(good_blob, &outlines, &good_blobs,
445                                        &junk_blobs);
446 
447     if (auto l = buckets->scan_next())
448       bucket_it.set_to_list(l);
449     else
450       break;
451   }
452 }
453 
454 /**
455  * @name outlines_to_blobs
456  *
457  * Gather together outlines into blobs using the usual bucket sort.
458  */
459 
outlines_to_blobs(BLOCK * block,ICOORD bleft,ICOORD tright,C_OUTLINE_LIST * outlines)460 void outlines_to_blobs( // find blobs
461     BLOCK *block,       // block to scan
462     ICOORD bleft, ICOORD tright, C_OUTLINE_LIST *outlines) {
463   // make buckets
464   OL_BUCKETS buckets(bleft, tright);
465 
466   fill_buckets(outlines, &buckets);
467   empty_buckets(block, &buckets);
468 }
469 
470 } // namespace tesseract
471