1 ///////////////////////////////////////////////////////////////////////
2 // File:        tablerecog.h
3 // Description: Functions to detect structure of tables.
4 // Author:    Nicholas Beato
5 //
6 // (C) Copyright 2010, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TABLERECOG_H_
20 #define TABLERECOG_H_
21 
22 #include "colpartitiongrid.h"
23 
24 namespace tesseract {
25 
26 // There are 2 classes in this file. They have 2 different purposes.
27 //  - StructuredTable contains the methods to find the structure given
28 //    a specific bounding box and grow that structure.
29 //  - TableRecognizer contains the methods to adjust the possible positions
30 //    of a table without worrying about structure.
31 //
32 // To use these classes, the assumption is that the TableFinder will
33 // have a guess of the location of a table (or possibly over/undersegmented
34 // tables). The TableRecognizer is responsible for finding the table boundaries
35 // at a high level. The StructuredTable class is responsible for determining
36 // the structure of the table and trying to maximize its bounds while retaining
37 // the structure.
38 // (The latter part is not implemented yet, but that was the goal).
39 //
40 // While on the boundary discussion, keep in mind that this is a first pass.
41 // There should eventually be some things like internal structure checks,
42 // and, more importantly, surrounding text flow checks.
43 //
44 
45 // Usage:
46 // The StructuredTable class contains methods to query a potential table.
47 // It has functions to find structure, count rows, find ColPartitions that
48 // intersect gridlines, etc. It is not meant to blindly find a table. It
49 // is meant to start with a known table location and enhance it.
50 // Usage:
51 //    ColPartitionGrid text_grid, line_grid;  // init
52 //    TBOX table_box;  // known location of table location
53 //
54 //    StructuredTable table;
55 //    table.Init();  // construction code
56 //    table.set_text_grid(/* text */);  // These 2 grids can be the same!
57 //    table.set_line_grid(/* lines */);
58 //    table.set_min_text_height(10);    // Filter vertical and tall text.
59 //    // IMPORTANT! The table needs to be told where it is!
60 //    table.set_bounding_box(table_box);  // Set initial table location.
61 //    if (table.FindWhitespacedStructure()) {
62 //      // process table
63 //      table.column_count();  // number of columns
64 //      table.row_count();     // number of rows
65 //      table.cells_count();   // number of cells
66 //      table.bounding_box();  // updated bounding box
67 //      // etc.
68 //    }
69 //
70 class TESS_API StructuredTable {
71 public:
72   StructuredTable();
73   ~StructuredTable() = default;
74 
75   // Initialization code. Must be called after the constructor.
76   void Init();
77 
78   // Sets the grids used by the table. These can be changed between
79   // calls to Recognize. They are treated as read-only data.
80   void set_text_grid(ColPartitionGrid *text);
81   void set_line_grid(ColPartitionGrid *lines);
82   // Filters text partitions that are ridiculously tall to prevent
83   // merging rows.
84   void set_max_text_height(int height);
85 
86   // Basic accessors. Some are treated as attributes despite having indirect
87   // representation.
88   bool is_lined() const;
89   unsigned row_count() const;
90   unsigned column_count() const;
91   unsigned cell_count() const;
92   void set_bounding_box(const TBOX &box);
93   const TBOX &bounding_box() const;
94   int median_cell_height();
95   int median_cell_width();
96   int row_height(unsigned row) const;
97   int column_width(unsigned column) const;
98   int space_above() const;
99   int space_below() const;
100 
101   // Given enough horizontal and vertical lines in a region, create this table
102   // based on the structure given by the lines. Return true if it worked out.
103   // Code assumes the lines exist. It is the caller's responsibility to check
104   // for lines and find an appropriate bounding box.
105   bool FindLinedStructure();
106 
107   // The main subroutine for finding generic table structure. The function
108   // finds the grid structure in the given box. Returns true if a good grid
109   // exists, implying that "this" table is valid.
110   bool FindWhitespacedStructure();
111 
112   ////////
113   //////// Functions to query table info.
114   ////////
115 
116   // Returns true if inserting part into the table does not cause any
117   // cell merges.
118   bool DoesPartitionFit(const ColPartition &part) const;
119   // Checks if a sub-table has multiple data cells filled.
120   int CountFilledCells();
121   int CountFilledCellsInRow(int row);
122   int CountFilledCellsInColumn(int column);
123   int CountFilledCells(unsigned row_start, unsigned row_end, unsigned column_start, unsigned column_end);
124 
125   // Makes sure that at least one cell in a row has substantial area filled.
126   // This can filter out large whitespace caused by growing tables too far
127   // and page numbers.
128   // (currently bugged for some reason).
129   bool VerifyRowFilled(int row);
130   // Finds the filled area in a cell.
131   double CalculateCellFilledPercentage(unsigned row, unsigned column);
132 
133   // Debug display, draws the table in the given color. If the table is not
134   // valid, the table and "best" grid lines are still drawn in the given color.
135   void Display(ScrollView *window, ScrollView::Color color);
136 
137 protected:
138   // Clear the structure information.
139   void ClearStructure();
140 
141   ////////
142   //////// Lined tables
143   ////////
144 
145   // Verifies the lines do not intersect partitions. This happens when
146   // the lines are in column boundaries and extend the full page. As a result,
147   // the grid lines go through column text. The condition is detectable.
148   bool VerifyLinedTableCells();
149 
150   ////////
151   //////// Tables with whitespace
152   ////////
153 
154   // This is the function to change if you want to filter resulting tables
155   // better. Right now it just checks for a minimum cell count and such.
156   // You could add things like maximum number of ColPartitions per cell or
157   // similar.
158   bool VerifyWhitespacedTable();
159   // Find the columns of a table using whitespace.
160   void FindWhitespacedColumns();
161   // Find the rows of a table using whitespace.
162   void FindWhitespacedRows();
163 
164   ////////
165   //////// Functions to provide information about the table.
166   ////////
167 
168   // Calculates the whitespace around the table using the table boundary and
169   // the supplied grids (set_text_grid and set_line_grid).
170   void CalculateMargins();
171   // Update the table margins with the supplied grid. This is
172   // only called by calculate margins to use multiple grid sources.
173   void UpdateMargins(ColPartitionGrid *grid);
174   int FindVerticalMargin(ColPartitionGrid *grid, int start_x, bool decrease) const;
175   int FindHorizontalMargin(ColPartitionGrid *grid, int start_y, bool decrease) const;
176   // Calculates stats on the table, namely the median cell height and width.
177   void CalculateStats();
178 
179   ////////
180   //////// Functions to try to "fix" some table errors.
181   ////////
182 
183   // Given a whitespaced table, this looks for bordering lines that might
184   // be page layout boxes around the table. It is necessary to get the margins
185   // correct on the table. If the lines are not joined, the margins will be
186   // the distance to the line, which is not right.
187   void AbsorbNearbyLines();
188 
189   // Nice utility function for finding partition gaps. You feed it a sorted
190   // list of all of the mins/maxes of the partitions in the table, and it gives
191   // you the gaps (middle). This works for both vertical and horizontal
192   // gaps.
193   //
194   // If you want to allow slight overlap in the division and the partitions,
195   // just scale down the partitions before inserting them in the list.
196   // Likewise, you can force at least some space between partitions.
197   // This trick is how the horizontal partitions are done (since the page
198   // skew could make it hard to find splits in the text).
199   //
200   // As a result, "0 distance" between closest partitions causes a gap.
201   // This is not a programmatic assumption. It is intentional and simplifies
202   // things.
203   //
204   // "max_merged" indicates both the minimum number of stacked partitions
205   // to cause a cell (add 1 to it), and the maximum number of partitions that
206   // a grid line can intersect. For example, if max_merged is 0, then lines
207   // are inserted wherever space exists between partitions. If it is 2,
208   // lines may intersect 2 partitions at most, but you also need at least
209   // 2 partitions to generate a line.
210   static void FindCellSplitLocations(const std::vector<int> &min_list,
211                                      const std::vector<int> &max_list, int max_merged,
212                                      std::vector<int> *locations);
213 
214   ////////
215   //////// Utility function for table queries
216   ////////
217 
218   // Counts the number of ColPartitions that intersect vertical cell
219   // division at this x value. Used by VerifyLinedTable.
220   int CountVerticalIntersections(int x);
221   int CountHorizontalIntersections(int y);
222 
223   // Counts how many text partitions are in this box.
224   int CountPartitions(const TBOX &box);
225 
226   ////////
227   //////// Data members.
228   ////////
229 
230   // Input data, used as read only data to make decisions.
231   ColPartitionGrid *text_grid_; // Text ColPartitions
232   ColPartitionGrid *line_grid_; // Line ColPartitions
233   // Table structure.
234   // bounding box is a convenient external representation.
235   // cell_x_ and cell_y_ indicate the grid lines.
236   TBOX bounding_box_;         // Bounding box
237   std::vector<int> cell_x_; // Locations of vertical divisions (sorted)
238   std::vector<int> cell_y_; // Locations of horizontal divisions (sorted)
239   bool is_lined_;             // Is the table backed up by a line structure
240   // Table margins, set via CalculateMargins
241   int space_above_;
242   int space_below_;
243   int space_left_;
244   int space_right_;
245   int median_cell_height_;
246   int median_cell_width_;
247   // Filters, used to prevent awkward partitions from destroying structure.
248   int max_text_height_;
249 };
250 
251 class TESS_API TableRecognizer {
252 public:
253   TableRecognizer() = default;
254   ~TableRecognizer() = default;
255 
256   // Initialization code. Must be called after the constructor.
257   void Init();
258 
259   ////////
260   //////// Pre-recognize methods to initial table constraints.
261   ////////
262 
263   // Sets the grids used by the table. These can be changed between
264   // calls to Recognize. They are treated as read-only data.
265   void set_text_grid(ColPartitionGrid *text);
266   void set_line_grid(ColPartitionGrid *lines);
267   // Sets some additional constraints on the table.
268   void set_min_height(int height);
269   void set_min_width(int width);
270   // Filters text partitions that are ridiculously tall to prevent
271   // merging rows. Note that "filters" refers to allowing horizontal
272   // cells to slice through them on the premise that they were
273   // merged text rows during previous layout.
274   void set_max_text_height(int height);
275 
276   // Given a guess location, the RecognizeTable function will try to find a
277   // structured grid in the area. On success, it will return a new
278   // StructuredTable (and assumes you will delete it). Otherwise,
279   // nullptr is returned.
280   //
281   // Keep in mind, this may "overgrow" or "undergrow" the size of guess.
282   // Ideally, there is a either a one-to-one correspondence between
283   // the guess and table or no table at all. This is not the best of
284   // assumptions right now, but was made to try to keep things simple in
285   // the first pass.
286   //
287   // If a line structure is available on the page in the given region,
288   // the table will use the linear structure as it is.
289   // Otherwise, it will try to maximize the whitespace around it while keeping
290   // a grid structure. This is somewhat working.
291   //
292   // Since the combination of adjustments can get high, effort was
293   // originally made to keep the number of adjustments linear in the number
294   // of partitions. The underlying structure finding code used to be
295   // much more complex. I don't know how necessary this constraint is anymore.
296   // The evaluation of a possible table is kept within O(nlogn) in the size of
297   // the table (where size is the number of partitions in the table).
298   // As a result, the algorithm is capable of O(n^2 log n). Depending
299   // on the grid search size, it may be higher.
300   //
301   // Last note: it is possible to just try all partition boundaries at a high
302   // level O(n^4) and do a verification scheme (at least O(nlogn)). If there
303   // area 200 partitions on a page, this could be too costly. Effort could go
304   // into pruning the search, but I opted for something quicker. I'm confident
305   // that the independent adjustments can get similar results and keep the
306   // complextiy down. However, the other approach could work without using
307   // TableFinder at all if it is fast enough.  It comes down to properly
308   // deciding what is a table. The code currently relies on TableFinder's
309   // guess to the location of a table for that.
310   StructuredTable *RecognizeTable(const TBOX &guess_box);
311 
312 protected:
313   ////////
314   //////// Lined tables
315   ////////
316 
317   // Returns true if the given box has a lined table within it. The
318   // table argument will be updated with the table if the table exists.
319   bool RecognizeLinedTable(const TBOX &guess_box, StructuredTable *table);
320   // Returns true if the given box has a large number of horizontal and
321   // vertical lines present. If so, we assume the extent of these lines
322   // uniquely defines a table and find that table via SolveLinedTable.
323   bool HasSignificantLines(const TBOX &guess);
324 
325   // Given enough horizontal and vertical lines in a region, find a bounding
326   // box that encloses all of them (as well as newly introduced lines).
327   // The bounding box is the smallest box that encloses the lines in guess
328   // without having any lines sticking out of it.
329   // bounding_box is an in/out parameter.
330   // On input, it in the extents of the box to search.
331   // On output, it is the resulting bounding box.
332   bool FindLinesBoundingBox(TBOX *bounding_box);
333   // Iteration in above search.
334   // bounding_box is an in/out parameter.
335   // On input, it in the extents of the box to search.
336   // On output, it is the resulting bounding box.
337   bool FindLinesBoundingBoxIteration(TBOX *bounding_box);
338 
339   ////////
340   //////// Generic "whitespaced" tables
341   ////////
342 
343   // Returns true if the given box has a whitespaced table within it. The
344   // table argument will be updated if the table exists. Also note
345   // that this method will fail if the guess_box center is not
346   // mostly within the table.
347   bool RecognizeWhitespacedTable(const TBOX &guess_box, StructuredTable *table);
348 
349   // Finds the location of a horizontal split relative to y.
350   // This function is mostly unused now. If the SolveWhitespacedTable
351   // changes much, it can be removed. Note, it isn't really as reliable
352   // as I thought. I went with alternatives for most of the other uses.
353   int NextHorizontalSplit(int left, int right, int y, bool top_to_bottom);
354 
355   // Indicates that a table row is weak. This means that it has
356   // many missing data cells or very large cell heights compared.
357   // to the rest of the table.
358   static bool IsWeakTableRow(StructuredTable *table, int row);
359 
360   // Input data, used as read only data to make decisions.
361   ColPartitionGrid *text_grid_ = nullptr; // Text ColPartitions
362   ColPartitionGrid *line_grid_ = nullptr; // Line ColPartitions
363   // Table constraints, a "good" table must satisfy these.
364   int min_height_ = 0;
365   int min_width_ = 0;
366   // Filters, used to prevent awkward partitions from destroying structure.
367   int max_text_height_ = INT32_MAX; // Horizontal lines may intersect taller text.
368 };
369 
370 } // namespace tesseract
371 
372 #endif /* TABLERECOG_H_ */
373