1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 
11 #include "gap_map.h"
12 
13 #include "statistc.h"
14 
15 namespace tesseract {
16 
17 BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
18 BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
19 BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide");
20 double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
21 
22 /*************************************************************************
23  * A block gap map is a quantised histogram of whitespace regions in the
24  * block. It is a vertical projection of wide gaps WITHIN lines
25  *
26  * The map is held as an array of counts of rows which have a wide gap
27  * covering that region of the row. Each bucket in the map represents a width
28  * of about half an xheight - (The median of the xhts in the rows is used.)
29  *
30  * The block is considered RECTANGULAR - delimited by the left and right
31  * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
32  * counted.
33  *
34  *************************************************************************/
35 
GAPMAP(TO_BLOCK * block)36 GAPMAP::GAPMAP(     // Constructor
37     TO_BLOCK *block // block
38 ) {
39   TO_ROW *row;         // current row
40   BLOBNBOX_IT blob_it; // iterator
41   TBOX blob_box;
42   TBOX prev_blob_box;
43   int16_t gap_width;
44   int16_t start_of_row;
45   int16_t end_of_row;
46   STATS xht_stats(0, 128);
47   int16_t min_quantum;
48   int16_t max_quantum;
49   int16_t i;
50 
51   /*
52   Find left and right extremes and bucket size
53 */
54   map = nullptr;
55   min_left = INT16_MAX;
56   max_right = -INT16_MAX;
57   total_rows = 0;
58   any_tabs = false;
59 
60   // row iterator
61   TO_ROW_IT row_it(block->get_rows());
62   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
63     row = row_it.data();
64     if (!row->blob_list()->empty()) {
65       total_rows++;
66       xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
67       blob_it.set_to_list(row->blob_list());
68       start_of_row = blob_it.data()->bounding_box().left();
69       end_of_row = blob_it.data_relative(-1)->bounding_box().right();
70       if (min_left > start_of_row) {
71         min_left = start_of_row;
72       }
73       if (max_right < end_of_row) {
74         max_right = end_of_row;
75       }
76     }
77   }
78   if ((total_rows < 3) || (min_left >= max_right)) {
79     bucket_size = 0;
80     map_max = 0;
81     total_rows = 0;
82     min_left = max_right = 0;
83     return;
84   }
85   bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
86   map_max = (max_right - min_left) / bucket_size;
87   map = new int16_t[map_max + 1];
88   for (i = 0; i <= map_max; i++) {
89     map[i] = 0;
90   }
91 
92   for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
93     row = row_it.data();
94     if (!row->blob_list()->empty()) {
95       blob_it.set_to_list(row->blob_list());
96       blob_it.mark_cycle_pt();
97       blob_box = box_next(&blob_it);
98       prev_blob_box = blob_box;
99       if (gapmap_use_ends) {
100         /* Leading space */
101         gap_width = blob_box.left() - min_left;
102         if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
103           max_quantum = (blob_box.left() - min_left) / bucket_size;
104           if (max_quantum > map_max) {
105             max_quantum = map_max;
106           }
107           for (i = 0; i <= max_quantum; i++) {
108             map[i]++;
109           }
110         }
111       }
112       while (!blob_it.cycled_list()) {
113         blob_box = box_next(&blob_it);
114         gap_width = blob_box.left() - prev_blob_box.right();
115         if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
116           min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
117           max_quantum = (blob_box.left() - min_left) / bucket_size;
118           if (max_quantum > map_max) {
119             max_quantum = map_max;
120           }
121           for (i = min_quantum; i <= max_quantum; i++) {
122             map[i]++;
123           }
124         }
125         prev_blob_box = blob_box;
126       }
127       if (gapmap_use_ends) {
128         /* Trailing space */
129         gap_width = max_right - prev_blob_box.right();
130         if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
131           min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
132           if (min_quantum < 0) {
133             min_quantum = 0;
134           }
135           for (i = min_quantum; i <= map_max; i++) {
136             map[i]++;
137           }
138         }
139       }
140     }
141   }
142   for (i = 0; i <= map_max; i++) {
143     if (map[i] > total_rows / 2) {
144       if (gapmap_no_isolated_quanta &&
145           (((i == 0) && (map[i + 1] <= total_rows / 2)) ||
146            ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||
147            ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&
148             (map[i + 1] <= total_rows / 2)))) {
149         map[i] = 0; // prevent isolated quantum
150       } else {
151         any_tabs = true;
152       }
153     }
154   }
155   if (gapmap_debug && any_tabs) {
156     tprintf("Table found\n");
157   }
158 }
159 
160 /*************************************************************************
161  * GAPMAP::table_gap()
162  * Is there a bucket in the specified range where more than half the rows in the
163  * block have a wide gap?
164  *************************************************************************/
165 
table_gap(int16_t left,int16_t right)166 bool GAPMAP::table_gap( // Is gap a table?
167     int16_t left,       // From here
168     int16_t right       // To here
169 ) {
170   int16_t min_quantum;
171   int16_t max_quantum;
172   int16_t i;
173   bool tab_found = false;
174 
175   if (!any_tabs) {
176     return false;
177   }
178 
179   min_quantum = (left - min_left) / bucket_size;
180   max_quantum = (right - min_left) / bucket_size;
181   // Clip to the bounds of the array. In some circumstances (big blob followed
182   // by small blob) max_quantum can exceed the map_max bounds, but we clip
183   // here instead, as it provides better long-term safety.
184   if (min_quantum < 0) {
185     min_quantum = 0;
186   }
187   if (max_quantum > map_max) {
188     max_quantum = map_max;
189   }
190   for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {
191     if (map[i] > total_rows / 2) {
192       tab_found = true;
193     }
194   }
195   return tab_found;
196 }
197 
198 } // namespace tesseract
199