1 /**********************************************************************
2  * File:        pagesegmain.cpp
3  * Description: Top-level page segmenter for Tesseract.
4  * Author:      Ray Smith
5  *
6  * (C) Copyright 2008, Google Inc.
7  ** Licensed under the Apache License, Version 2.0 (the "License");
8  ** you may not use this file except in compliance with the License.
9  ** You may obtain a copy of the License at
10  ** http://www.apache.org/licenses/LICENSE-2.0
11  ** Unless required by applicable law or agreed to in writing, software
12  ** distributed under the License is distributed on an "AS IS" BASIS,
13  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  ** See the License for the specific language governing permissions and
15  ** limitations under the License.
16  *
17  **********************************************************************/
18 
19 #ifdef _WIN32
20 #  ifndef unlink
21 #    include <io.h>
22 #  endif
23 #else
24 #  include <unistd.h>
25 #endif // _WIN32
26 
27 // Include automatically generated configuration file if running autoconf.
28 #ifdef HAVE_CONFIG_H
29 #  include "config_auto.h"
30 #endif
31 
32 #include <allheaders.h>
33 #include "blobbox.h"
34 #include "blread.h"
35 #include "colfind.h"
36 #include "debugpixa.h"
37 #ifndef DISABLED_LEGACY_ENGINE
38 #  include "equationdetect.h"
39 #endif
40 #include <tesseract/osdetect.h>
41 #include "imagefind.h"
42 #include "linefind.h"
43 #include "makerow.h"
44 #include "tabvector.h"
45 #include "tesseractclass.h"
46 #include "tessvars.h"
47 #include "textord.h"
48 #include "tordmain.h"
49 #include "wordseg.h"
50 
51 namespace tesseract {
52 
53 // Max erosions to perform in removing an enclosing circle.
54 const int kMaxCircleErosions = 8;
55 
56 // Helper to remove an enclosing circle from an image.
57 // If there isn't one, then the image will most likely get badly mangled.
58 // The returned pix must be pixDestroyed after use. nullptr may be returned
59 // if the image doesn't meet the trivial conditions that it uses to determine
60 // success.
RemoveEnclosingCircle(Image pixs)61 static Image RemoveEnclosingCircle(Image pixs) {
62   Image pixsi = pixInvert(nullptr, pixs);
63   Image pixc = pixCreateTemplate(pixs);
64   pixSetOrClearBorder(pixc, 1, 1, 1, 1, PIX_SET);
65   pixSeedfillBinary(pixc, pixc, pixsi, 4);
66   pixInvert(pixc, pixc);
67   pixsi.destroy();
68   Image pixt = pixs & pixc;
69   l_int32 max_count;
70   pixCountConnComp(pixt, 8, &max_count);
71   // The count has to go up before we start looking for the minimum.
72   l_int32 min_count = INT32_MAX;
73   Image pixout = nullptr;
74   for (int i = 1; i < kMaxCircleErosions; i++) {
75     pixt.destroy();
76     pixErodeBrick(pixc, pixc, 3, 3);
77     pixt = pixs & pixc;
78     l_int32 count;
79     pixCountConnComp(pixt, 8, &count);
80     if (i == 1 || count > max_count) {
81       max_count = count;
82       min_count = count;
83     } else if (count < min_count) {
84       min_count = count;
85       pixout.destroy();
86       pixout = pixt.copy(); // Save the best.
87     } else if (count >= min_count) {
88       break; // We have passed by the best.
89     }
90   }
91   pixt.destroy();
92   pixc.destroy();
93   return pixout;
94 }
95 
96 /**
97  * Segment the page according to the current value of tessedit_pageseg_mode.
98  * pix_binary_ is used as the source image and should not be nullptr.
99  * On return the blocks list owns all the constructed page layout.
100  */
SegmentPage(const char * input_file,BLOCK_LIST * blocks,Tesseract * osd_tess,OSResults * osr)101 int Tesseract::SegmentPage(const char *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess,
102                            OSResults *osr) {
103   ASSERT_HOST(pix_binary_ != nullptr);
104   int width = pixGetWidth(pix_binary_);
105   int height = pixGetHeight(pix_binary_);
106   // Get page segmentation mode.
107   auto pageseg_mode = static_cast<PageSegMode>(static_cast<int>(tessedit_pageseg_mode));
108   // If a UNLV zone file can be found, use that instead of segmentation.
109   if (!PSM_COL_FIND_ENABLED(pageseg_mode) && input_file != nullptr && input_file[0] != '\0') {
110     std::string name = input_file;
111     const char *lastdot = strrchr(name.c_str(), '.');
112     if (lastdot != nullptr) {
113       name[lastdot - name.c_str()] = '\0';
114     }
115     read_unlv_file(name, width, height, blocks);
116   }
117   if (blocks->empty()) {
118     // No UNLV file present. Work according to the PageSegMode.
119     // First make a single block covering the whole image.
120     BLOCK_IT block_it(blocks);
121     auto *block = new BLOCK("", true, 0, 0, 0, 0, width, height);
122     block->set_right_to_left(right_to_left());
123     block_it.add_to_end(block);
124   } else {
125     // UNLV file present. Use PSM_SINGLE_BLOCK.
126     pageseg_mode = PSM_SINGLE_BLOCK;
127   }
128   // The diacritic_blobs holds noise blobs that may be diacritics. They
129   // are separated out on areas of the image that seem noisy and short-circuit
130   // the layout process, going straight from the initial partition creation
131   // right through to after word segmentation, where they are added to the
132   // rej_cblobs list of the most appropriate word. From there classification
133   // will determine whether they are used.
134   BLOBNBOX_LIST diacritic_blobs;
135   int auto_page_seg_ret_val = 0;
136   TO_BLOCK_LIST to_blocks;
137   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
138       PSM_SPARSE(pageseg_mode)) {
139     auto_page_seg_ret_val =
140         AutoPageSeg(pageseg_mode, blocks, &to_blocks,
141                     enable_noise_removal ? &diacritic_blobs : nullptr, osd_tess, osr);
142     if (pageseg_mode == PSM_OSD_ONLY) {
143       return auto_page_seg_ret_val;
144     }
145     // To create blobs from the image region bounds uncomment this line:
146     //  to_blocks.clear();  // Uncomment to go back to the old mode.
147   } else {
148     deskew_ = FCOORD(1.0f, 0.0f);
149     reskew_ = FCOORD(1.0f, 0.0f);
150     if (pageseg_mode == PSM_CIRCLE_WORD) {
151       Image pixcleaned = RemoveEnclosingCircle(pix_binary_);
152       if (pixcleaned != nullptr) {
153         pix_binary_.destroy();
154         pix_binary_ = pixcleaned;
155       }
156     }
157   }
158 
159   if (auto_page_seg_ret_val < 0) {
160     return -1;
161   }
162 
163   if (blocks->empty()) {
164     if (textord_debug_tabfind) {
165       tprintf("Empty page\n");
166     }
167     return 0; // AutoPageSeg found an empty page.
168   }
169   bool splitting = pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
170   bool cjk_mode = textord_use_cjk_fp_model;
171 
172   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_,
173                        pix_grey_, splitting || cjk_mode, &diacritic_blobs, blocks, &to_blocks);
174   return auto_page_seg_ret_val;
175 }
176 
177 /**
178  * Auto page segmentation. Divide the page image into blocks of uniform
179  * text linespacing and images.
180  *
181  * Resolution (in ppi) is derived from the input image.
182  *
183  * The output goes in the blocks list with corresponding TO_BLOCKs in the
184  * to_blocks list.
185  *
186  * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
187  * the image into columns, but multiple blocks are still made if the text is
188  * of non-uniform linespacing.
189  *
190  * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
191  * confuse layout analysis by causing textline overlap, are placed there,
192  * with the expectation that they will be reassigned to words later and
193  * noise/diacriticness determined via classification.
194  *
195  * If osd (orientation and script detection) is true then that is performed
196  * as well. If only_osd is true, then only orientation and script detection is
197  * performed. If osd is desired, (osd or only_osd) then osr_tess must be
198  * another Tesseract that was initialized especially for osd, and the results
199  * will be output into osr (orientation and script result).
200  */
AutoPageSeg(PageSegMode pageseg_mode,BLOCK_LIST * blocks,TO_BLOCK_LIST * to_blocks,BLOBNBOX_LIST * diacritic_blobs,Tesseract * osd_tess,OSResults * osr)201 int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks,
202                            BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) {
203   Image photomask_pix = nullptr;
204   Image musicmask_pix = nullptr;
205   // The blocks made by the ColumnFinder. Moved to blocks before return.
206   BLOCK_LIST found_blocks;
207   TO_BLOCK_LIST temp_blocks;
208 
209   ColumnFinder *finder = SetupPageSegAndDetectOrientation(
210       pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
211       pageseg_apply_music_mask ? &musicmask_pix : nullptr);
212   int result = 0;
213   if (finder != nullptr) {
214     TO_BLOCK_IT to_block_it(&temp_blocks);
215     TO_BLOCK *to_block = to_block_it.data();
216     if (musicmask_pix != nullptr) {
217       // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
218       // blocks separately. For now combine with photomask_pix.
219       photomask_pix |= musicmask_pix;
220     }
221 #ifndef DISABLED_LEGACY_ENGINE
222     if (equ_detect_) {
223       finder->SetEquationDetect(equ_detect_);
224     }
225 #endif // ndef DISABLED_LEGACY_ENGINE
226     result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, to_block,
227                                 photomask_pix, pix_thresholds_, pix_grey_, &pixa_debug_,
228                                 &found_blocks, diacritic_blobs, to_blocks);
229     if (result >= 0) {
230       finder->GetDeskewVectors(&deskew_, &reskew_);
231     }
232     delete finder;
233   }
234   photomask_pix.destroy();
235   musicmask_pix.destroy();
236   if (result < 0) {
237     return result;
238   }
239 
240   blocks->clear();
241   BLOCK_IT block_it(blocks);
242   // Move the found blocks to the input/output blocks.
243   block_it.add_list_after(&found_blocks);
244   return result;
245 }
246 
247 // Helper adds all the scripts from sid_set converted to ids from osd_set to
248 // allowed_ids.
AddAllScriptsConverted(const UNICHARSET & sid_set,const UNICHARSET & osd_set,std::vector<int> * allowed_ids)249 static void AddAllScriptsConverted(const UNICHARSET &sid_set, const UNICHARSET &osd_set,
250                                    std::vector<int> *allowed_ids) {
251   for (int i = 0; i < sid_set.get_script_table_size(); ++i) {
252     if (i != sid_set.null_sid()) {
253       const char *script = sid_set.get_script_from_script_id(i);
254       allowed_ids->push_back(osd_set.get_script_id_from_name(script));
255     }
256   }
257 }
258 
259 /**
260  * Sets up auto page segmentation, determines the orientation, and corrects it.
261  * Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to
262  * facilitate testing.
263  * photo_mask_pix is a pointer to a nullptr pointer that will be filled on
264  * return with the leptonica photo mask, which must be pixDestroyed by the
265  * caller. to_blocks is an empty list that will be filled with (usually a
266  * single) block that is used during layout analysis. This ugly API is required
267  * because of the possibility of a unlv zone file.
268  * TODO(rays) clean this up.
269  * See AutoPageSeg for other arguments.
270  * The returned ColumnFinder must be deleted after use.
271  */
SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,BLOCK_LIST * blocks,Tesseract * osd_tess,OSResults * osr,TO_BLOCK_LIST * to_blocks,Image * photo_mask_pix,Image * music_mask_pix)272 ColumnFinder *Tesseract::SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode,
273                                                           BLOCK_LIST *blocks, Tesseract *osd_tess,
274                                                           OSResults *osr, TO_BLOCK_LIST *to_blocks,
275                                                           Image *photo_mask_pix,
276                                                           Image *music_mask_pix) {
277   int vertical_x = 0;
278   int vertical_y = 1;
279   TabVector_LIST v_lines;
280   TabVector_LIST h_lines;
281   ICOORD bleft(0, 0);
282 
283   ASSERT_HOST(pix_binary_ != nullptr);
284   if (tessedit_dump_pageseg_images) {
285     pixa_debug_.AddPix(pix_binary_, "PageSegInput");
286   }
287   // Leptonica is used to find the rule/separator lines in the input.
288   LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_,
289                                  &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines);
290   if (tessedit_dump_pageseg_images) {
291     pixa_debug_.AddPix(pix_binary_, "NoLines");
292   }
293   // Leptonica is used to find a mask of the photo regions in the input.
294   *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
295   if (tessedit_dump_pageseg_images) {
296     Image pix_no_image_ = nullptr;
297     if (*photo_mask_pix != nullptr) {
298       pix_no_image_ = pixSubtract(nullptr, pix_binary_, *photo_mask_pix);
299     } else {
300       pix_no_image_ = pix_binary_.clone();
301     }
302     pixa_debug_.AddPix(pix_no_image_, "NoImages");
303     pix_no_image_.destroy();
304   }
305   if (!PSM_COL_FIND_ENABLED(pageseg_mode)) {
306     v_lines.clear();
307   }
308 
309   // The rest of the algorithm uses the usual connected components.
310   textord_.find_components(pix_binary_, blocks, to_blocks);
311 
312   TO_BLOCK_IT to_block_it(to_blocks);
313   // There must be exactly one input block.
314   // TODO(rays) handle new textline finding with a UNLV zone file.
315   ASSERT_HOST(to_blocks->singleton());
316   TO_BLOCK *to_block = to_block_it.data();
317   TBOX blkbox = to_block->block->pdblk.bounding_box();
318   ColumnFinder *finder = nullptr;
319   int estimated_resolution = source_resolution_;
320   if (source_resolution_ == kMinCredibleResolution) {
321     // Try to estimate resolution from typical body text size.
322     int res = IntCastRounded(to_block->line_size * kResolutionEstimationFactor);
323     if (res > estimated_resolution && res < kMaxCredibleResolution) {
324       estimated_resolution = res;
325       tprintf("Estimating resolution as %d\n", estimated_resolution);
326     }
327   }
328 
329   if (to_block->line_size >= 2) {
330     finder = new ColumnFinder(static_cast<int>(to_block->line_size), blkbox.botleft(),
331                               blkbox.topright(), estimated_resolution, textord_use_cjk_fp_model,
332                               textord_tabfind_aligned_gap_fraction, &v_lines, &h_lines, vertical_x,
333                               vertical_y);
334 
335     finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
336 
337 #ifndef DISABLED_LEGACY_ENGINE
338 
339     if (equ_detect_) {
340       equ_detect_->LabelSpecialText(to_block);
341     }
342 
343     BLOBNBOX_CLIST osd_blobs;
344     // osd_orientation is the number of 90 degree rotations to make the
345     // characters upright. (See tesseract/osdetect.h for precise definition.)
346     // We want the text lines horizontal, (vertical text indicates vertical
347     // textlines) which may conflict (eg vertically written CJK).
348     int osd_orientation = 0;
349     bool vertical_text =
350         textord_tabfind_force_vertical_text || pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
351     if (!vertical_text && textord_tabfind_vertical_text && PSM_ORIENTATION_ENABLED(pageseg_mode)) {
352       vertical_text = finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio, to_block,
353                                                       &osd_blobs);
354     }
355     if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != nullptr && osr != nullptr) {
356       std::vector<int> osd_scripts;
357       if (osd_tess != this) {
358         // We are running osd as part of layout analysis, so constrain the
359         // scripts to those allowed by *this.
360         AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
361         for (auto &lang : sub_langs_) {
362           AddAllScriptsConverted(lang->unicharset, osd_tess->unicharset, &osd_scripts);
363         }
364       }
365       os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
366       if (pageseg_mode == PSM_OSD_ONLY) {
367         delete finder;
368         return nullptr;
369       }
370       osd_orientation = osr->best_result.orientation_id;
371       double osd_score = osr->orientations[osd_orientation];
372       double osd_margin = min_orientation_margin * 2;
373       for (int i = 0; i < 4; ++i) {
374         if (i != osd_orientation && osd_score - osr->orientations[i] < osd_margin) {
375           osd_margin = osd_score - osr->orientations[i];
376         }
377       }
378       int best_script_id = osr->best_result.script_id;
379       const char *best_script_str = osd_tess->unicharset.get_script_from_script_id(best_script_id);
380       bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
381                  best_script_id == osd_tess->unicharset.hiragana_sid() ||
382                  best_script_id == osd_tess->unicharset.katakana_sid() ||
383                  strcmp("Japanese", best_script_str) == 0 ||
384                  strcmp("Korean", best_script_str) == 0 || strcmp("Hangul", best_script_str) == 0;
385       if (cjk) {
386         finder->set_cjk_script(true);
387       }
388       if (osd_margin < min_orientation_margin) {
389         // The margin is weak.
390         if (!cjk && !vertical_text && osd_orientation == 2) {
391           // upside down latin text is improbable with such a weak margin.
392           tprintf(
393               "OSD: Weak margin (%.2f), horiz textlines, not CJK: "
394               "Don't rotate.\n",
395               osd_margin);
396           osd_orientation = 0;
397         } else {
398           tprintf(
399               "OSD: Weak margin (%.2f) for %d blob text block, "
400               "but using orientation anyway: %d\n",
401               osd_margin, osd_blobs.length(), osd_orientation);
402         }
403       }
404     }
405     osd_blobs.shallow_clear();
406     finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
407 
408 #endif // ndef DISABLED_LEGACY_ENGINE
409   }
410 
411   return finder;
412 }
413 
414 } // namespace tesseract.
415