1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10
11 #include "gap_map.h"
12
13 #include "statistc.h"
14
15 namespace tesseract {
16
17 BOOL_VAR(gapmap_debug, false, "Say which blocks have tables");
18 BOOL_VAR(gapmap_use_ends, false, "Use large space at start and end of rows");
19 BOOL_VAR(gapmap_no_isolated_quanta, false, "Ensure gaps not less than 2quanta wide");
20 double_VAR(gapmap_big_gaps, 1.75, "xht multiplier");
21
22 /*************************************************************************
23 * A block gap map is a quantised histogram of whitespace regions in the
24 * block. It is a vertical projection of wide gaps WITHIN lines
25 *
26 * The map is held as an array of counts of rows which have a wide gap
27 * covering that region of the row. Each bucket in the map represents a width
28 * of about half an xheight - (The median of the xhts in the rows is used.)
29 *
30 * The block is considered RECTANGULAR - delimited by the left and right
31 * extremes of the rows in the block. However, ONLY wide gaps WITHIN a row are
32 * counted.
33 *
34 *************************************************************************/
35
GAPMAP(TO_BLOCK * block)36 GAPMAP::GAPMAP( // Constructor
37 TO_BLOCK *block // block
38 ) {
39 TO_ROW *row; // current row
40 BLOBNBOX_IT blob_it; // iterator
41 TBOX blob_box;
42 TBOX prev_blob_box;
43 int16_t gap_width;
44 int16_t start_of_row;
45 int16_t end_of_row;
46 STATS xht_stats(0, 128);
47 int16_t min_quantum;
48 int16_t max_quantum;
49 int16_t i;
50
51 /*
52 Find left and right extremes and bucket size
53 */
54 map = nullptr;
55 min_left = INT16_MAX;
56 max_right = -INT16_MAX;
57 total_rows = 0;
58 any_tabs = false;
59
60 // row iterator
61 TO_ROW_IT row_it(block->get_rows());
62 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
63 row = row_it.data();
64 if (!row->blob_list()->empty()) {
65 total_rows++;
66 xht_stats.add(static_cast<int16_t>(floor(row->xheight + 0.5)), 1);
67 blob_it.set_to_list(row->blob_list());
68 start_of_row = blob_it.data()->bounding_box().left();
69 end_of_row = blob_it.data_relative(-1)->bounding_box().right();
70 if (min_left > start_of_row) {
71 min_left = start_of_row;
72 }
73 if (max_right < end_of_row) {
74 max_right = end_of_row;
75 }
76 }
77 }
78 if ((total_rows < 3) || (min_left >= max_right)) {
79 bucket_size = 0;
80 map_max = 0;
81 total_rows = 0;
82 min_left = max_right = 0;
83 return;
84 }
85 bucket_size = static_cast<int16_t>(floor(xht_stats.median() + 0.5)) / 2;
86 map_max = (max_right - min_left) / bucket_size;
87 map = new int16_t[map_max + 1];
88 for (i = 0; i <= map_max; i++) {
89 map[i] = 0;
90 }
91
92 for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
93 row = row_it.data();
94 if (!row->blob_list()->empty()) {
95 blob_it.set_to_list(row->blob_list());
96 blob_it.mark_cycle_pt();
97 blob_box = box_next(&blob_it);
98 prev_blob_box = blob_box;
99 if (gapmap_use_ends) {
100 /* Leading space */
101 gap_width = blob_box.left() - min_left;
102 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
103 max_quantum = (blob_box.left() - min_left) / bucket_size;
104 if (max_quantum > map_max) {
105 max_quantum = map_max;
106 }
107 for (i = 0; i <= max_quantum; i++) {
108 map[i]++;
109 }
110 }
111 }
112 while (!blob_it.cycled_list()) {
113 blob_box = box_next(&blob_it);
114 gap_width = blob_box.left() - prev_blob_box.right();
115 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
116 min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
117 max_quantum = (blob_box.left() - min_left) / bucket_size;
118 if (max_quantum > map_max) {
119 max_quantum = map_max;
120 }
121 for (i = min_quantum; i <= max_quantum; i++) {
122 map[i]++;
123 }
124 }
125 prev_blob_box = blob_box;
126 }
127 if (gapmap_use_ends) {
128 /* Trailing space */
129 gap_width = max_right - prev_blob_box.right();
130 if ((gap_width > gapmap_big_gaps * row->xheight) && gap_width > 2) {
131 min_quantum = (prev_blob_box.right() - min_left) / bucket_size;
132 if (min_quantum < 0) {
133 min_quantum = 0;
134 }
135 for (i = min_quantum; i <= map_max; i++) {
136 map[i]++;
137 }
138 }
139 }
140 }
141 }
142 for (i = 0; i <= map_max; i++) {
143 if (map[i] > total_rows / 2) {
144 if (gapmap_no_isolated_quanta &&
145 (((i == 0) && (map[i + 1] <= total_rows / 2)) ||
146 ((i == map_max) && (map[i - 1] <= total_rows / 2)) ||
147 ((i > 0) && (i < map_max) && (map[i - 1] <= total_rows / 2) &&
148 (map[i + 1] <= total_rows / 2)))) {
149 map[i] = 0; // prevent isolated quantum
150 } else {
151 any_tabs = true;
152 }
153 }
154 }
155 if (gapmap_debug && any_tabs) {
156 tprintf("Table found\n");
157 }
158 }
159
160 /*************************************************************************
161 * GAPMAP::table_gap()
162 * Is there a bucket in the specified range where more than half the rows in the
163 * block have a wide gap?
164 *************************************************************************/
165
table_gap(int16_t left,int16_t right)166 bool GAPMAP::table_gap( // Is gap a table?
167 int16_t left, // From here
168 int16_t right // To here
169 ) {
170 int16_t min_quantum;
171 int16_t max_quantum;
172 int16_t i;
173 bool tab_found = false;
174
175 if (!any_tabs) {
176 return false;
177 }
178
179 min_quantum = (left - min_left) / bucket_size;
180 max_quantum = (right - min_left) / bucket_size;
181 // Clip to the bounds of the array. In some circumstances (big blob followed
182 // by small blob) max_quantum can exceed the map_max bounds, but we clip
183 // here instead, as it provides better long-term safety.
184 if (min_quantum < 0) {
185 min_quantum = 0;
186 }
187 if (max_quantum > map_max) {
188 max_quantum = map_max;
189 }
190 for (i = min_quantum; (!tab_found && (i <= max_quantum)); i++) {
191 if (map[i] > total_rows / 2) {
192 tab_found = true;
193 }
194 }
195 return tab_found;
196 }
197
198 } // namespace tesseract
199