1 /////////////////////////////////////////////////////////////////////// 2 // File: colfind.h 3 // Description: Class to find columns in the grid of BLOBNBOXes. 4 // Author: Ray Smith 5 // 6 // (C) Copyright 2008, Google Inc. 7 // Licensed under the Apache License, Version 2.0 (the "License"); 8 // you may not use this file except in compliance with the License. 9 // You may obtain a copy of the License at 10 // http://www.apache.org/licenses/LICENSE-2.0 11 // Unless required by applicable law or agreed to in writing, software 12 // distributed under the License is distributed on an "AS IS" BASIS, 13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 // See the License for the specific language governing permissions and 15 // limitations under the License. 16 // 17 /////////////////////////////////////////////////////////////////////// 18 19 #ifndef TESSERACT_TEXTORD_COLFIND_H_ 20 #define TESSERACT_TEXTORD_COLFIND_H_ 21 22 #include "colpartitiongrid.h" 23 #include "colpartitionset.h" 24 #include "debugpixa.h" 25 #include "imagefind.h" 26 #include "ocrblock.h" 27 #include "tabfind.h" 28 #include "textlineprojection.h" 29 30 class BLOCK_LIST; 31 struct Boxa; 32 struct Pixa; 33 class DENORM; 34 class ScrollView; 35 class STATS; 36 class TO_BLOCK; 37 38 namespace tesseract { 39 40 class ColPartitionSet; 41 class ColPartitionSet_LIST; 42 class ColSegment_LIST; 43 class ColumnGroup_LIST; 44 class LineSpacing; 45 class StrokeWidth; 46 class TempColumn_LIST; 47 class EquationDetectBase; 48 49 // The ColumnFinder class finds columns in the grid. 50 class TESS_API ColumnFinder : public TabFind { 51 public: 52 // Gridsize is an estimate of the text size in the image. A suitable value 53 // is in TO_BLOCK::line_size after find_components has been used to make 54 // the blobs. 55 // bleft and tright are the bounds of the image (rectangle) being processed. 56 // vlines is a (possibly empty) list of TabVector and vertical_x and y are 57 // the sum logical vertical vector produced by LineFinder::FindVerticalLines. 58 // If cjk_script is true, then broken CJK characters are fixed during 59 // layout analysis to assist in detecting horizontal vs vertically written 60 // textlines. 61 ColumnFinder(int gridsize, const ICOORD &bleft, const ICOORD &tright, int resolution, 62 bool cjk_script, double aligned_gap_fraction, TabVector_LIST *vlines, 63 TabVector_LIST *hlines, int vertical_x, int vertical_y); 64 ~ColumnFinder() override; 65 66 // Accessors for testing denorm()67 const DENORM *denorm() const { 68 return denorm_; 69 } projection()70 const TextlineProjection *projection() const { 71 return &projection_; 72 } set_cjk_script(bool is_cjk)73 void set_cjk_script(bool is_cjk) { 74 cjk_script_ = is_cjk; 75 } 76 77 // ====================================================================== 78 // The main function of ColumnFinder is broken into pieces to facilitate 79 // optional insertion of orientation and script detection in an efficient 80 // way. The calling sequence IS MANDATORY however, whether or not 81 // OSD is being used: 82 // 1. Construction. 83 // 2. SetupAndFilterNoise. 84 // 3. IsVerticallyAlignedText. 85 // 4. CorrectOrientation. 86 // 5. FindBlocks. 87 // 6. Destruction. Use of a single column finder for multiple images does not 88 // make sense. 89 // Throughout these steps, the ColPartitions are owned by part_grid_, which 90 // means that that it must be kept correct. Exception: big_parts_ owns its 91 // own ColPartitions. 92 // The BLOBNBOXes are owned by the input TO_BLOCK for the whole time, except 93 // for a phase in FindBlocks before TransformToBlocks, when they become 94 // owned by the ColPartitions. The owner() ColPartition of a BLOBNBOX 95 // indicates more of a betrothal for the majority of layout analysis, ie 96 // which ColPartition will take ownership when the blobs are release from 97 // the input TO_BLOCK. Exception: image_bblobs_ owns the fake blobs that 98 // are part of the image regions, as they are not on any TO_BLOCK list. 99 // TODO(rays) break up column finder further into smaller classes, as 100 // there is a lot more to it than column finding now. 101 // ====================================================================== 102 103 // Performs initial processing on the blobs in the input_block: 104 // Setup the part_grid, stroke_width_, nontext_map_. 105 // Obvious noise blobs are filtered out and used to mark the nontext_map_. 106 // Initial stroke-width analysis is used to get local text alignment 107 // direction, so the textline projection_ map can be setup. 108 // On return, IsVerticallyAlignedText may be called (now optionally) to 109 // determine the gross textline alignment of the page. 110 void SetupAndFilterNoise(PageSegMode pageseg_mode, Image photo_mask_pix, TO_BLOCK *input_block); 111 112 // Tests for vertical alignment of text (returning true if so), and generates 113 // a list of blobs (in osd_blobs) for orientation and script detection. 114 // block is the single block for the whole page or rectangle to be OCRed. 115 // Note that the vertical alignment may be due to text whose writing direction 116 // is vertical, like say Japanese, or due to text whose writing direction is 117 // horizontal but whose text appears vertically aligned because the image is 118 // not the right way up. 119 // find_vertical_text_ratio should be textord_tabfind_vertical_text_ratio. 120 bool IsVerticallyAlignedText(double find_vertical_text_ratio, TO_BLOCK *block, 121 BLOBNBOX_CLIST *osd_blobs); 122 123 // Rotates the blobs and the TabVectors so that the gross writing direction 124 // (text lines) are horizontal and lines are read down the page. 125 // Applied rotation stored in rotation_. 126 // A second rotation is calculated for application during recognition to 127 // make the rotated blobs upright for recognition. 128 // Subsequent rotation stored in text_rotation_. 129 // 130 // Arguments: 131 // vertical_text_lines is true if the text lines are vertical. 132 // recognition_rotation [0..3] is the number of anti-clockwise 90 degree 133 // rotations from osd required for the text to be upright and readable. 134 void CorrectOrientation(TO_BLOCK *block, bool vertical_text_lines, int recognition_rotation); 135 136 // Finds blocks of text, image, rule line, table etc, returning them in the 137 // blocks and to_blocks 138 // (Each TO_BLOCK points to the basic BLOCK and adds more information.) 139 // Image blocks are generated by a combination of photo_mask_pix (which may 140 // NOT be nullptr) and the rejected text found during preliminary textline 141 // finding. 142 // The input_block is the result of a call to find_components, and contains 143 // the blobs found in the image or rectangle to be OCRed. These blobs will be 144 // removed and placed in the output blocks, while unused ones will be deleted. 145 // If single_column is true, the input is treated as single column, but 146 // it is still divided into blocks of equal line spacing/text size. 147 // scaled_color is scaled down by scaled_factor from the input color image, 148 // and may be nullptr if the input was not color. 149 // grey_pix is optional, but if present must match the photo_mask_pix in size, 150 // and must be a *real* grey image instead of binary_pix * 255. 151 // thresholds_pix is expected to be present iff grey_pix is present and 152 // can be an integer factor reduction of the grey_pix. It represents the 153 // thresholds that were used to create the binary_pix from the grey_pix. 154 // Small blobs that confuse the segmentation into lines are placed into 155 // diacritic_blobs, with the intention that they be put into the most 156 // appropriate word after the rest of layout analysis. 157 // Returns -1 if the user hits the 'd' key in the blocks window while running 158 // in debug mode, which requests a retry with more debug info. 159 int FindBlocks(PageSegMode pageseg_mode, Image scaled_color, int scaled_factor, TO_BLOCK *block, 160 Image photo_mask_pix, Image thresholds_pix, Image grey_pix, DebugPixa *pixa_debug, 161 BLOCK_LIST *blocks, BLOBNBOX_LIST *diacritic_blobs, TO_BLOCK_LIST *to_blocks); 162 163 // Get the rotation required to deskew, and its inverse rotation. 164 void GetDeskewVectors(FCOORD *deskew, FCOORD *reskew); 165 166 // Set the equation detection pointer. 167 void SetEquationDetect(EquationDetectBase *detect); 168 169 private: 170 // Displays the blob and block bounding boxes in a window called Blocks. 171 void DisplayBlocks(BLOCK_LIST *blocks); 172 // Displays the column edges at each grid y coordinate defined by 173 // best_columns_. 174 void DisplayColumnBounds(PartSetVector *sets); 175 176 ////// Functions involved in determining the columns used on the page. ///// 177 178 // Sets up column_sets_ (the determined column layout at each horizontal 179 // slice). Returns false if the page is empty. 180 bool MakeColumns(bool single_column); 181 // Attempt to improve the column_candidates by expanding the columns 182 // and adding new partitions from the partition sets in src_sets. 183 // Src_sets may be equal to column_candidates, in which case it will 184 // use them as a source to improve themselves. 185 void ImproveColumnCandidates(PartSetVector *src_sets, PartSetVector *column_sets); 186 // Prints debug information on the column candidates. 187 void PrintColumnCandidates(const char *title); 188 // Finds the optimal set of columns that cover the entire image with as 189 // few changes in column partition as possible. 190 // Returns true if any part of the page is multi-column. 191 bool AssignColumns(const PartSetVector &part_sets); 192 // Finds the biggest range in part_sets_ that has no assigned column, but 193 // column assignment is possible. 194 bool BiggestUnassignedRange(int set_count, const bool *any_columns_possible, int *start, 195 int *end); 196 // Finds the modal compatible column_set_ index within the given range. 197 int RangeModalColumnSet(int **column_set_costs, const int *assigned_costs, int start, int end); 198 // Given that there are many column_set_id compatible columns in the range, 199 // shrinks the range to the longest contiguous run of compatibility, allowing 200 // gaps where no columns are possible, but not where competing columns are 201 // possible. 202 void ShrinkRangeToLongestRun(int **column_set_costs, const int *assigned_costs, 203 const bool *any_columns_possible, int column_set_id, int *best_start, 204 int *best_end); 205 // Moves start in the direction of step, up to, but not including end while 206 // the only incompatible regions are no more than kMaxIncompatibleColumnCount 207 // in size, and the compatible regions beyond are bigger. 208 void ExtendRangePastSmallGaps(int **column_set_costs, const int *assigned_costs, 209 const bool *any_columns_possible, int column_set_id, int step, 210 int end, int *start); 211 // Assigns the given column_set_id to the part_sets_ in the given range. 212 void AssignColumnToRange(int column_set_id, int start, int end, int **column_set_costs, 213 int *assigned_costs); 214 215 // Computes the mean_column_gap_. 216 void ComputeMeanColumnGap(bool any_multi_column); 217 218 //////// Functions that manipulate ColPartitions in the part_grid_ ///// 219 //////// to split, merge, find margins, and find types. ////////////// 220 221 // Hoovers up all un-owned blobs and deletes them. 222 // The rest get released from the block so the ColPartitions can pass 223 // ownership to the output blocks. 224 void ReleaseBlobsAndCleanupUnused(TO_BLOCK *block); 225 // Splits partitions that cross columns where they have nothing in the gap. 226 void GridSplitPartitions(); 227 // Merges partitions where there is vertical overlap, within a single column, 228 // and the horizontal gap is small enough. 229 void GridMergePartitions(); 230 // Inserts remaining noise blobs into the most applicable partition if any. 231 // If there is no applicable partition, then the blobs are deleted. 232 void InsertRemainingNoise(TO_BLOCK *block); 233 // Remove partitions that come from horizontal lines that look like 234 // underlines, but are not part of a table. 235 void GridRemoveUnderlinePartitions(); 236 // Add horizontal line separators as partitions. 237 void GridInsertHLinePartitions(); 238 // Add vertical line separators as partitions. 239 void GridInsertVLinePartitions(); 240 // For every ColPartition in the grid, sets its type based on position 241 // in the columns. 242 void SetPartitionTypes(); 243 // Only images remain with multiple types in a run of partners. 244 // Sets the type of all in the group to the maximum of the group. 245 void SmoothPartnerRuns(); 246 247 //////// Functions that make the final output blocks /////// 248 249 // Helper functions for TransformToBlocks. 250 // Add the part to the temp list in the correct order. 251 void AddToTempPartList(ColPartition *part, ColPartition_CLIST *temp_list); 252 // Add everything from the temp list to the work_set assuming correct order. 253 void EmptyTempPartList(ColPartition_CLIST *temp_list, WorkingPartSet_LIST *work_set); 254 255 // Transform the grid of partitions to the output blocks. 256 void TransformToBlocks(BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks); 257 258 // Reflect the blob boxes (but not the outlines) in the y-axis so that 259 // the blocks get created in the correct RTL order. Rotates the blobs 260 // in the input_block and the bblobs list. 261 // The reflection is undone in RotateAndReskewBlocks by 262 // reflecting the blocks themselves, and then recomputing the blob bounding 263 // boxes. 264 void ReflectForRtl(TO_BLOCK *input_block, BLOBNBOX_LIST *bblobs); 265 266 // Undo the deskew that was done in FindTabVectors, as recognition is done 267 // without correcting blobs or blob outlines for skew. 268 // Reskew the completed blocks to put them back to the original rotated coords 269 // that were created by CorrectOrientation. 270 // If the input_is_rtl, then reflect the blocks in the y-axis to undo the 271 // reflection that was done before FindTabVectors. 272 // Blocks that were identified as vertical text (relative to the rotated 273 // coordinates) are further rotated so the text lines are horizontal. 274 // blob polygonal outlines are rotated to match the position of the blocks 275 // that they are in, and their bounding boxes are recalculated to be accurate. 276 // Record appropriate inverse transformations and required 277 // classifier transformation in the blocks. 278 void RotateAndReskewBlocks(bool input_is_rtl, TO_BLOCK_LIST *to_blocks); 279 280 // Computes the rotations for the block (to make textlines horizontal) and 281 // for the blobs (for classification) and sets the appropriate members 282 // of the given block. 283 // Returns the rotation that needs to be applied to the blobs to make 284 // them sit in the rotated block. 285 FCOORD ComputeBlockAndClassifyRotation(BLOCK *block); 286 287 // If true then the page language is cjk, so it is safe to perform 288 // FixBrokenCJK. 289 bool cjk_script_; 290 // The minimum gutter width to apply for finding columns. 291 // Modified when vertical text is detected to prevent detection of 292 // vertical text lines as columns. 293 int min_gutter_width_; 294 // The mean gap between columns over the page. 295 int mean_column_gap_; 296 // Config param saved at construction time. Modifies min_gutter_width_ with 297 // vertical text to prevent detection of vertical text as columns. 298 double tabfind_aligned_gap_fraction_; 299 // The rotation vector needed to convert original coords to deskewed. 300 FCOORD deskew_; 301 // The rotation vector needed to convert deskewed back to original coords. 302 FCOORD reskew_; 303 // The rotation vector used to rotate vertically oriented pages. 304 FCOORD rotation_; 305 // The rotation vector needed to convert the rotated back to original coords. 306 FCOORD rerotate_; 307 // The additional rotation vector needed to rotate text for recognition. 308 FCOORD text_rotation_; 309 // The column_sets_ contain the ordered candidate ColPartitionSets that 310 // define the possible divisions of the page into columns. 311 PartSetVector column_sets_; 312 // A simple array of pointers to the best assigned column division at 313 // each grid y coordinate. 314 ColPartitionSet **best_columns_; 315 // The grid used for creating initial partitions with strokewidth. 316 StrokeWidth *stroke_width_; 317 // The grid used to hold ColPartitions after the columns have been determined. 318 ColPartitionGrid part_grid_; 319 // List of ColPartitions that are no longer needed after they have been 320 // turned into regions, but are kept around because they are referenced 321 // by the part_grid_. 322 ColPartition_LIST good_parts_; 323 // List of ColPartitions that are big and might be dropcap or vertically 324 // joined. 325 ColPartition_LIST big_parts_; 326 // List of ColPartitions that have been declared noise. 327 ColPartition_LIST noise_parts_; 328 // The fake blobs that are made from the images. 329 BLOBNBOX_LIST image_bblobs_; 330 // Horizontal line separators. 331 TabVector_LIST horizontal_lines_; 332 // Image map of photo/noise areas on the page. 333 Image nontext_map_; 334 // Textline projection map. 335 TextlineProjection projection_; 336 // Sequence of DENORMS that indicate how to get back to the original image 337 // coordinate space. The destructor must delete all the DENORMs in the chain. 338 DENORM *denorm_; 339 340 // The equation region detector pointer. Note: This pointer is passed in by 341 // member function SetEquationDetect, and releasing it is NOT owned by this 342 // class. 343 EquationDetectBase *equation_detect_; 344 345 #ifndef GRAPHICS_DISABLED 346 // Various debug windows that automatically go away on completion. 347 ScrollView *input_blobs_win_ = nullptr; 348 349 // Allow a subsequent instance to reuse the blocks window. 350 // Not thread-safe, but multiple threads shouldn't be using windows anyway. 351 static ScrollView *blocks_win_; 352 #endif 353 }; 354 355 } // namespace tesseract. 356 357 #endif // TESSERACT_TEXTORD_COLFIND_H_ 358