1 ///////////////////////////////////////////////////////////////////////
2 // File:        colfind.h
3 // Description: Class to find columns in the grid of BLOBNBOXes.
4 // Author:      Ray Smith
5 //
6 // (C) Copyright 2008, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_TEXTORD_COLFIND_H_
20 #define TESSERACT_TEXTORD_COLFIND_H_
21 
22 #include "colpartitiongrid.h"
23 #include "colpartitionset.h"
24 #include "debugpixa.h"
25 #include "imagefind.h"
26 #include "ocrblock.h"
27 #include "tabfind.h"
28 #include "textlineprojection.h"
29 
30 class BLOCK_LIST;
31 struct Boxa;
32 struct Pixa;
33 class DENORM;
34 class ScrollView;
35 class STATS;
36 class TO_BLOCK;
37 
38 namespace tesseract {
39 
40 class ColPartitionSet;
41 class ColPartitionSet_LIST;
42 class ColSegment_LIST;
43 class ColumnGroup_LIST;
44 class LineSpacing;
45 class StrokeWidth;
46 class TempColumn_LIST;
47 class EquationDetectBase;
48 
49 // The ColumnFinder class finds columns in the grid.
// It derives from TabFind and is driven through the fixed sequence of public
// calls documented inside the class (SetupAndFilterNoise through FindBlocks).
50 class TESS_API ColumnFinder : public TabFind {
51 public:
52   // Gridsize is an estimate of the text size in the image. A suitable value
53   // is in TO_BLOCK::line_size after find_components has been used to make
54   // the blobs.
55   // bleft and tright are the bounds of the image (rectangle) being processed.
56   // vlines is a (possibly empty) list of TabVector and vertical_x and y are
57   // the sum logical vertical vector produced by LineFinder::FindVerticalLines.
58   // If cjk_script is true, then broken CJK characters are fixed during
59   // layout analysis to assist in detecting horizontal vs vertically written
60   // textlines.
  // NOTE(review): hlines, resolution and aligned_gap_fraction are not described
  // above. hlines is presumably the horizontal counterpart of vlines (see
  // horizontal_lines_) and aligned_gap_fraction is presumably the config value
  // kept in tabfind_aligned_gap_fraction_ -- TODO confirm in colfind.cpp.
61   ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution,
62                bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines,
63                TabVector_LIST *hlines, int vertical_x, int vertical_y);
64   ~ColumnFinder() override;
65 
66   // Accessors for testing.
  // denorm() returns the denorm_ pointer and projection() returns the address
  // of the projection_ member; both are const pointers to state held by this
  // object. set_cjk_script() overwrites cjk_script_.
denorm()67   const DENORM *denorm() const {
68     return denorm_;
69   }
projection()70   const TextlineProjection *projection() const {
71     return &projection_;
72   }
set_cjk_script(bool is_cjk)73   void set_cjk_script(bool is_cjk) {
74     cjk_script_ = is_cjk;
75   }
76 
77   // ======================================================================
78   // The main function of ColumnFinder is broken into pieces to facilitate
79   // optional insertion of orientation and script detection in an efficient
80   // way. The calling sequence IS MANDATORY however, whether or not
81   // OSD is being used:
82   // 1. Construction.
83   // 2. SetupAndFilterNoise.
84   // 3. IsVerticallyAlignedText.
85   // 4. CorrectOrientation.
86   // 5. FindBlocks.
87   // 6. Destruction. Use of a single column finder for multiple images does not
88   //    make sense.
89   // Throughout these steps, the ColPartitions are owned by part_grid_, which
90   // means that it must be kept correct. Exception: big_parts_ owns its
91   // own ColPartitions.
92   // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except
93   // for a phase in FindBlocks before TransformToBlocks, when they become
94   // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX
95   // indicates more of a betrothal for the majority of layout analysis, ie
96   // which ColPartition will take ownership when the blobs are released from
97   // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that
98   // are part of the image regions, as they are not on any TO_BLOCK list.
99   // TODO(rays) break up column finder further into smaller classes, as
100   // there is a lot more to it than column finding now.
101   // ======================================================================
102 
103   // Performs initial processing on the blobs in the input_block:
104   // Sets up part_grid_, stroke_width_ and nontext_map_.
105   // Obvious noise blobs are filtered out and used to mark the nontext_map_.
106   // Initial stroke-width analysis is used to get local text alignment
107   // direction, so the textline projection_ map can be setup.
108   // On return, IsVerticallyAlignedText may be called (now optionally) to
109   // determine the gross textline alignment of the page.
110   void SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix, TO_BLOCK *input_block);
111 
112   // Tests for vertical alignment of text (returning true if so), and generates
113   // a list of blobs (in osd_blobs) for orientation and script detection.
114   // block is the single block for the whole page or rectangle to be OCRed.
115   // Note that the vertical alignment may be due to text whose writing direction
116   // is vertical, like say Japanese, or due to text whose writing direction is
117   // horizontal but whose text appears vertically aligned because the image is
118   // not the right way up.
119   // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio.
120   bool IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block,
121                                BLOBNBOX_CLIST *osd_blobs);
122 
123   // Rotates the blobs and the TabVectors so that the gross writing direction
124   // (text lines) is horizontal and lines are read down the page.
125   // Applied rotation stored in rotation_.
126   // A second rotation is calculated for application during recognition to
127   // make the rotated blobs upright for recognition.
128   // Subsequent rotation stored in text_rotation_.
129   //
130   // Arguments:
131   //   vertical_text_lines is true if the text lines are vertical.
132   //   recognition_rotation [0..3] is the number of anti-clockwise 90 degree
133   //   rotations from osd required for the text to be upright and readable.
134   void CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines, int recognition_rotation);
135 
136   // Finds blocks of text, image, rule line, table etc, returning them in the
137   // blocks and to_blocks.
138   // (Each TO_BLOCK points to the basic BLOCK and adds more information.)
139   // Image blocks are generated by a combination of photo_mask_pix (which may
140   // NOT be nullptr) and the rejected text found during preliminary textline
141   // finding.
142   // The input_block is the result of a call to find_components, and contains
143   // the blobs found in the image or rectangle to be OCRed. These blobs will be
144   // removed and placed in the output blocks, while unused ones will be deleted.
145   // If single_column is true, the input is treated as single column, but
146   // it is still divided into blocks of equal line spacing/text size.
147   // scaled_color is scaled down by scaled_factor from the input color image,
148   // and may be nullptr if the input was not color.
149   // grey_pix is optional, but if present must match the photo_mask_pix in size,
150   // and must be a *real* grey image instead of binary_pix * 255.
151   // thresholds_pix is expected to be present iff grey_pix is present and
152   // can be an integer factor reduction of the grey_pix. It represents the
153   // thresholds that were used to create the binary_pix from the grey_pix.
154   // Small blobs that confuse the segmentation into lines are placed into
155   // diacritic_blobs, with the intention that they be put into the most
156   // appropriate word after the rest of layout analysis.
157   // Returns -1 if the user hits the 'd' key in the blocks window while running
158   // in debug mode, which requests a retry with more debug info.
  // NOTE(review): the comment refers to input_block and single_column, but the
  // signature has block and pageseg_mode instead; the value returned on normal
  // completion is also not stated here -- TODO confirm in colfind.cpp.
159   int FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor, TO_BLOCK *block,
160                  Image photo_mask_pix, Image thresholds_pix, Image grey_pix, DebugPixa *pixa_debug,
161                  BLOCK_LIST *blocks, BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks);
162 
163   // Get the rotation required to deskew, and its inverse rotation.
164   void GetDeskewVectors(FCOORD *deskew, FCOORD *reskew);
165 
166   // Set the equation detection pointer. The pointer is NOT owned by this class.
167   void SetEquationDetect(EquationDetectBase *detect);
168 
169 private:
170   // Displays the blob and block bounding boxes in a window called Blocks.
171   void DisplayBlocks(BLOCK_LIST *blocks);
172   // Displays the column edges at each grid y coordinate defined by
173   // best_columns_.
174   void DisplayColumnBounds(PartSetVector *sets);
175 
176   ////// Functions involved in determining the columns used on the page. /////
177 
178   // Sets up column_sets_ (the determined column layout at each horizontal
179   // slice). Returns false if the page is empty.
180   bool MakeColumns(bool single_column);
181   // Attempts to improve the column candidates (column_sets) by expanding the
182   // columns and adding new partitions from the partition sets in src_sets.
183   // src_sets may be equal to column_sets, in which case it will
184   // use them as a source to improve themselves.
185   void ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets);
186   // Prints debug information on the column candidates.
187   void PrintColumnCandidates(const char *title);
188   // Finds the optimal set of columns that cover the entire image with as
189   // few changes in column partition as possible.
190   // Returns true if any part of the page is multi-column.
191   bool AssignColumns(const PartSetVector &part_sets);
192   // Finds the biggest range in part_sets_ that has no assigned column, but
193   // column assignment is possible.
194   bool BiggestUnassignedRange(int set_count, const bool *any_columns_possible, int *start,
195                               int *end);
196   // Finds the modal compatible column_set_ index within the given range.
197   int RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start, int end);
198   // Given that there are many column_set_id compatible columns in the range,
199   // shrinks the range to the longest contiguous run of compatibility, allowing
200   // gaps where no columns are possible, but not where competing columns are
201   // possible.
202   void ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs,
203                                const bool *any_columns_possible, int column_set_id, int *best_start,
204                                int *best_end);
205   // Moves start in the direction of step, up to, but not including end while
206   // the only incompatible regions are no more than kMaxIncompatibleColumnCount
207   // in size, and the compatible regions beyond are bigger.
208   void ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs,
209                                 const bool *any_columns_possible, int column_set_id, int step,
210                                 int end, int *start);
211   // Assigns the given column_set_id to the part_sets_ in the given range.
212   void AssignColumnToRange(int column_set_id, int start, int end, int **column_set_costs,
213                            int *assigned_costs);
214 
215   // Computes the mean_column_gap_.
216   void ComputeMeanColumnGap(bool any_multi_column);
217 
218   //////// Functions that manipulate ColPartitions in the part_grid_ /////
219   //////// to split, merge, find margins, and find types.  //////////////
220 
221   // Hoovers up all un-owned blobs and deletes them.
222   // The rest get released from the block so the ColPartitions can pass
223   // ownership to the output blocks.
224   void ReleaseBlobsAndCleanupUnused(TO_BLOCK *block);
225   // Splits partitions that cross columns where they have nothing in the gap.
226   void GridSplitPartitions();
227   // Merges partitions where there is vertical overlap, within a single column,
228   // and the horizontal gap is small enough.
229   void GridMergePartitions();
230   // Inserts remaining noise blobs into the most applicable partition if any.
231   // If there is no applicable partition, then the blobs are deleted.
232   void InsertRemainingNoise(TO_BLOCK *block);
233   // Remove partitions that come from horizontal lines that look like
234   // underlines, but are not part of a table.
235   void GridRemoveUnderlinePartitions();
236   // Add horizontal line separators as partitions.
237   void GridInsertHLinePartitions();
238   // Add vertical line separators as partitions.
239   void GridInsertVLinePartitions();
240   // For every ColPartition in the grid, sets its type based on position
241   // in the columns.
242   void SetPartitionTypes();
243   // Only images remain with multiple types in a run of partners.
244   // Sets the type of all in the group to the maximum of the group.
245   void SmoothPartnerRuns();
246 
247   //////// Functions that make the final output blocks             ///////
248 
249   // Helper functions for TransformToBlocks.
250   // Add the part to the temp list in the correct order.
251   void AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list);
252   // Add everything from the temp list to the work_set assuming correct order.
253   void EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set);
254 
255   // Transform the grid of partitions to the output blocks.
256   void TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
257 
258   // Reflect the blob boxes (but not the outlines) in the y-axis so that
259   // the blocks get created in the correct RTL order. Rotates the blobs
260   // in the input_block and the bblobs list.
261   // The reflection is undone in RotateAndReskewBlocks by
262   // reflecting the blocks themselves, and then recomputing the blob bounding
263   // boxes.
264   void ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs);
265 
266   // Undo the deskew that was done in FindTabVectors, as recognition is done
267   // without correcting blobs or blob outlines for skew.
268   // Reskew the completed blocks to put them back to the original rotated coords
269   // that were created by CorrectOrientation.
270   // If the input_is_rtl, then reflect the blocks in the y-axis to undo the
271   // reflection that was done before FindTabVectors.
272   // Blocks that were identified as vertical text (relative to the rotated
273   // coordinates) are further rotated so the text lines are horizontal.
274   // blob polygonal outlines are rotated to match the position of the blocks
275   // that they are in, and their bounding boxes are recalculated to be accurate.
276   // Record appropriate inverse transformations and required
277   // classifier transformation in the blocks.
278   void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *to_blocks);
279 
280   // Computes the rotations for the block (to make textlines horizontal) and
281   // for the blobs (for classification) and sets the appropriate members
282   // of the given block.
283   // Returns the rotation that needs to be applied to the blobs to make
284   // them sit in the rotated block.
285   FCOORD ComputeBlockAndClassifyRotation(BLOCK *block);
286 
287   // If true then the page language is cjk, so it is safe to perform
288   // FixBrokenCJK.
289   bool cjk_script_;
290   // The minimum gutter width to apply for finding columns.
291   // Modified when vertical text is detected to prevent detection of
292   // vertical text lines as columns.
293   int min_gutter_width_;
294   // The mean gap between columns over the page.
295   int mean_column_gap_;
296   // Config param saved at construction time. Modifies min_gutter_width_ with
297   // vertical text to prevent detection of vertical text as columns.
298   double tabfind_aligned_gap_fraction_;
299   // The rotation vector needed to convert original coords to deskewed.
300   FCOORD deskew_;
301   // The rotation vector needed to convert deskewed back to original coords.
302   FCOORD reskew_;
303   // The rotation vector used to rotate vertically oriented pages.
304   FCOORD rotation_;
305   // The rotation vector needed to convert the rotated back to original coords.
306   FCOORD rerotate_;
307   // The additional rotation vector needed to rotate text for recognition.
308   FCOORD text_rotation_;
309   // The column_sets_ contain the ordered candidate ColPartitionSets that
310   // define the possible divisions of the page into columns.
311   PartSetVector column_sets_;
312   // A simple array of pointers to the best assigned column division at
313   // each grid y coordinate.
314   ColPartitionSet **best_columns_;
315   // The grid used for creating initial partitions with strokewidth.
316   StrokeWidth *stroke_width_;
317   // The grid used to hold ColPartitions after the columns have been determined.
318   ColPartitionGrid part_grid_;
319   // List of ColPartitions that are no longer needed after they have been
320   // turned into regions, but are kept around because they are referenced
321   // by the part_grid_.
322   ColPartition_LIST good_parts_;
323   // List of ColPartitions that are big and might be dropcap or vertically
324   // joined.
325   ColPartition_LIST big_parts_;
326   // List of ColPartitions that have been declared noise.
327   ColPartition_LIST noise_parts_;
328   // The fake blobs that are made from the images.
329   BLOBNBOX_LIST image_bblobs_;
330   // Horizontal line separators.
331   TabVector_LIST horizontal_lines_;
332   // Image map of photo/noise areas on the page.
333   Image nontext_map_;
334   // Textline projection map.
335   TextlineProjection projection_;
336   // Sequence of DENORMS that indicate how to get back to the original image
337   // coordinate space. The destructor must delete all the DENORMs in the chain.
338   DENORM *denorm_;
339 
340   // The equation region detector pointer. Note: This pointer is passed in by
341   // member function SetEquationDetect; it is NOT owned by this class, so this
342   // class is not responsible for releasing it.
343   EquationDetectBase *equation_detect_;
344 
345 #ifndef GRAPHICS_DISABLED
346   // Various debug windows that automatically go away on completion.
347   ScrollView *input_blobs_win_ = nullptr;
348 
349   // Allow a subsequent instance to reuse the blocks window.
350   // Not thread-safe, but multiple threads shouldn't be using windows anyway.
351   static ScrollView *blocks_win_;
352 #endif
353 };
354 
355 } // namespace tesseract.
356 
357 #endif // TESSERACT_TEXTORD_COLFIND_H_
358