1 /**********************************************************************
2  * File:        devanagari_processing.cpp
3  * Description: Methods to process images containing devanagari symbols,
4  *              prior to classification.
5  * Author:      Shobhit Saxena
6  * Created:     Mon Nov 17 20:26:01 IST 2008
7  *
8  * (C) Copyright 2008, Google Inc.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifdef HAVE_CONFIG_H
22 #  include "config_auto.h"
23 #endif
24 
25 #include "devanagari_processing.h"
26 
27 #include "debugpixa.h"
28 #include "statistc.h"
29 #include "tordmain.h"
30 
31 #include <allheaders.h>
32 
33 namespace tesseract {
34 
35 // Flags controlling the debugging information for shiro-rekha splitting
36 // strategies.
37 INT_VAR(devanagari_split_debuglevel, 0, "Debug level for split shiro-rekha process.");
38 
39 BOOL_VAR(devanagari_split_debugimage, 0,
40          "Whether to create a debug image for split shiro-rekha process.");
41 
ShiroRekhaSplitter()42 ShiroRekhaSplitter::ShiroRekhaSplitter() {
43   orig_pix_ = nullptr;
44   segmentation_block_list_ = nullptr;
45   splitted_image_ = nullptr;
46   global_xheight_ = kUnspecifiedXheight;
47   perform_close_ = false;
48   debug_image_ = nullptr;
49   pageseg_split_strategy_ = NO_SPLIT;
50   ocr_split_strategy_ = NO_SPLIT;
51 }
52 
~ShiroRekhaSplitter()53 ShiroRekhaSplitter::~ShiroRekhaSplitter() {
54   Clear();
55 }
56 
Clear()57 void ShiroRekhaSplitter::Clear() {
58   orig_pix_.destroy();
59   splitted_image_.destroy();
60   pageseg_split_strategy_ = NO_SPLIT;
61   ocr_split_strategy_ = NO_SPLIT;
62   debug_image_.destroy();
63   segmentation_block_list_ = nullptr;
64   global_xheight_ = kUnspecifiedXheight;
65   perform_close_ = false;
66 }
67 
68 // On setting the input image, a clone of it is owned by this class.
set_orig_pix(Image pix)69 void ShiroRekhaSplitter::set_orig_pix(Image pix) {
70   if (orig_pix_) {
71     orig_pix_.destroy();
72   }
73   orig_pix_ = pix.clone();
74 }
75 
76 // Top-level method to perform splitting based on current settings.
77 // Returns true if a split was actually performed.
78 // split_for_pageseg should be true if the splitting is being done prior to
79 // page segmentation. This mode uses the flag
80 // pageseg_devanagari_split_strategy to determine the splitting strategy.
Split(bool split_for_pageseg,DebugPixa * pixa_debug)81 bool ShiroRekhaSplitter::Split(bool split_for_pageseg, DebugPixa *pixa_debug) {
82   SplitStrategy split_strategy = split_for_pageseg ? pageseg_split_strategy_ : ocr_split_strategy_;
83   if (split_strategy == NO_SPLIT) {
84     return false; // Nothing to do.
85   }
86   ASSERT_HOST(split_strategy == MINIMAL_SPLIT || split_strategy == MAXIMAL_SPLIT);
87   ASSERT_HOST(orig_pix_);
88   if (devanagari_split_debuglevel > 0) {
89     tprintf("Splitting shiro-rekha ...\n");
90     tprintf("Split strategy = %s\n", split_strategy == MINIMAL_SPLIT ? "Minimal" : "Maximal");
91     tprintf("Initial pageseg available = %s\n", segmentation_block_list_ ? "yes" : "no");
92   }
93   // Create a copy of original image to store the splitting output.
94   splitted_image_.destroy();
95   splitted_image_ = orig_pix_.copy();
96 
97   // Initialize debug image if required.
98   if (devanagari_split_debugimage) {
99     debug_image_.destroy();
100     debug_image_ = pixConvertTo32(orig_pix_);
101   }
102 
103   // Determine all connected components in the input image. A close operation
104   // may be required prior to this, depending on the current settings.
105   Image pix_for_ccs = orig_pix_.clone();
106   if (perform_close_ && global_xheight_ != kUnspecifiedXheight && !segmentation_block_list_) {
107     if (devanagari_split_debuglevel > 0) {
108       tprintf("Performing a global close operation..\n");
109     }
110     // A global measure is available for xheight, but no local information
111     // exists.
112     pix_for_ccs.destroy();
113     pix_for_ccs = orig_pix_.copy();
114     PerformClose(pix_for_ccs, global_xheight_);
115   }
116   Pixa *ccs;
117   Boxa *tmp_boxa = pixConnComp(pix_for_ccs, &ccs, 8);
118   boxaDestroy(&tmp_boxa);
119   pix_for_ccs.destroy();
120 
121   // Iterate over all connected components. Get their bounding boxes and clip
122   // out the image regions corresponding to these boxes from the original image.
123   // Conditionally run splitting on each of them.
124   Boxa *regions_to_clear = boxaCreate(0);
125   int num_ccs = 0;
126   if (ccs != nullptr) {
127     num_ccs = pixaGetCount(ccs);
128   }
129   for (int i = 0; i < num_ccs; ++i) {
130     Box *box = ccs->boxa->box[i];
131     Image word_pix = pixClipRectangle(orig_pix_, box, nullptr);
132     ASSERT_HOST(word_pix);
133     int xheight = GetXheightForCC(box);
134     if (xheight == kUnspecifiedXheight && segmentation_block_list_ && devanagari_split_debugimage) {
135       pixRenderBoxArb(debug_image_, box, 1, 255, 0, 0);
136     }
137     // If some xheight measure is available, attempt to pre-eliminate small
138     // blobs from the shiro-rekha process. This is primarily to save the CCs
139     // corresponding to punctuation marks/small dots etc which are part of
140     // larger graphemes.
141     if (xheight == kUnspecifiedXheight || (box->w > xheight / 3 && box->h > xheight / 2)) {
142       SplitWordShiroRekha(split_strategy, word_pix, xheight, box->x, box->y, regions_to_clear);
143     } else if (devanagari_split_debuglevel > 0) {
144       tprintf("CC dropped from splitting: %d,%d (%d, %d)\n", box->x, box->y, box->w, box->h);
145     }
146     word_pix.destroy();
147   }
148   // Actually clear the boxes now.
149   for (int i = 0; i < boxaGetCount(regions_to_clear); ++i) {
150     Box *box = boxaGetBox(regions_to_clear, i, L_CLONE);
151     pixClearInRect(splitted_image_, box);
152     boxDestroy(&box);
153   }
154   boxaDestroy(&regions_to_clear);
155   pixaDestroy(&ccs);
156   if (devanagari_split_debugimage && pixa_debug != nullptr) {
157     pixa_debug->AddPix(debug_image_, split_for_pageseg ? "pageseg_split" : "ocr_split");
158   }
159   return true;
160 }
161 
162 // Method to perform a close operation on the input image. The xheight
163 // estimate decides the size of sel used.
PerformClose(Image pix,int xheight_estimate)164 void ShiroRekhaSplitter::PerformClose(Image pix, int xheight_estimate) {
165   pixCloseBrick(pix, pix, xheight_estimate / 8, xheight_estimate / 3);
166 }
167 
168 // This method resolves the cc bbox to a particular row and returns the row's
169 // xheight.
GetXheightForCC(Box * cc_bbox)170 int ShiroRekhaSplitter::GetXheightForCC(Box *cc_bbox) {
171   if (!segmentation_block_list_) {
172     return global_xheight_;
173   }
174   // Compute the box coordinates in Tesseract's coordinate system.
175   TBOX bbox(cc_bbox->x, pixGetHeight(orig_pix_) - cc_bbox->y - cc_bbox->h - 1,
176             cc_bbox->x + cc_bbox->w, pixGetHeight(orig_pix_) - cc_bbox->y - 1);
177   // Iterate over all blocks.
178   BLOCK_IT block_it(segmentation_block_list_);
179   for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
180     BLOCK *block = block_it.data();
181     // Iterate over all rows in the block.
182     ROW_IT row_it(block->row_list());
183     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
184       ROW *row = row_it.data();
185       if (!row->bounding_box().major_overlap(bbox)) {
186         continue;
187       }
188       // Row could be skewed, warped, etc. Use the position of the box to
189       // determine the baseline position of the row for that x-coordinate.
190       // Create a square TBOX whose baseline's mid-point lies at this point
191       // and side is row's xheight. Take the overlap of this box with the input
192       // box and check if it is a 'major overlap'. If so, this box lies in this
193       // row. In that case, return the xheight for this row.
194       float box_middle = 0.5 * (bbox.left() + bbox.right());
195       int baseline = static_cast<int>(row->base_line(box_middle) + 0.5);
196       TBOX test_box(box_middle - row->x_height() / 2, baseline, box_middle + row->x_height() / 2,
197                     static_cast<int>(baseline + row->x_height()));
198       // Compute overlap. If it is is a major overlap, this is the right row.
199       if (bbox.major_overlap(test_box)) {
200         return row->x_height();
201       }
202     }
203   }
204   // No row found for this bbox.
205   return kUnspecifiedXheight;
206 }
207 
208 // Returns a list of regions (boxes) which should be cleared in the original
209 // image so as to perform shiro-rekha splitting. Pix is assumed to carry one
210 // (or less) word only. Xheight measure could be the global estimate, the row
211 // estimate, or unspecified. If unspecified, over splitting may occur, since a
212 // conservative estimate of stroke width along with an associated multiplier
213 // is used in its place. It is advisable to have a specified xheight when
214 // splitting for classification/training.
215 // A vertical projection histogram of all the on-pixels in the input pix is
216 // computed. The maxima of this histogram is regarded as an approximate location
217 // of the shiro-rekha. By descending on the maxima's peak on both sides,
218 // stroke width of shiro-rekha is estimated.
219 // A horizontal projection histogram is computed for a sub-image of the input
220 // image, which extends from just below the shiro-rekha down to a certain
221 // leeway. The leeway depends on the input xheight, if provided, else a
222 // conservative multiplier on approximate stroke width is used (which may lead
223 // to over-splitting).
SplitWordShiroRekha(SplitStrategy split_strategy,Image pix,int xheight,int word_left,int word_top,Boxa * regions_to_clear)224 void ShiroRekhaSplitter::SplitWordShiroRekha(SplitStrategy split_strategy, Image pix, int xheight,
225                                              int word_left, int word_top, Boxa *regions_to_clear) {
226   if (split_strategy == NO_SPLIT) {
227     return;
228   }
229   int width = pixGetWidth(pix);
230   int height = pixGetHeight(pix);
231   // Statistically determine the yextents of the shiro-rekha.
232   int shirorekha_top, shirorekha_bottom, shirorekha_ylevel;
233   GetShiroRekhaYExtents(pix, &shirorekha_top, &shirorekha_bottom, &shirorekha_ylevel);
234   // Since the shiro rekha is also a stroke, its width is equal to the stroke
235   // width.
236   int stroke_width = shirorekha_bottom - shirorekha_top + 1;
237 
238   // Some safeguards to protect CCs we do not want to be split.
239   // These are particularly useful when the word wasn't eliminated earlier
240   // because xheight information was unavailable.
241   if (shirorekha_ylevel > height / 2) {
242     // Shirorekha shouldn't be in the bottom half of the word.
243     if (devanagari_split_debuglevel > 0) {
244       tprintf("Skipping splitting CC at (%d, %d): shirorekha in lower half..\n", word_left,
245               word_top);
246     }
247     return;
248   }
249   if (stroke_width > height / 3) {
250     // Even the boldest of fonts shouldn't do this.
251     if (devanagari_split_debuglevel > 0) {
252       tprintf("Skipping splitting CC at (%d, %d): stroke width too huge..\n", word_left, word_top);
253     }
254     return;
255   }
256 
257   // Clear the ascender and descender regions of the word.
258   // Obtain a vertical projection histogram for the resulting image.
259   Box *box_to_clear = boxCreate(0, shirorekha_top - stroke_width / 3, width, 5 * stroke_width / 3);
260   Image word_in_xheight = pix.copy();
261   pixClearInRect(word_in_xheight, box_to_clear);
262   // Also clear any pixels which are below shirorekha_bottom + some leeway.
263   // The leeway is set to xheight if the information is available, else it is a
264   // multiplier applied to the stroke width.
265   int leeway_to_keep = stroke_width * 3;
266   if (xheight != kUnspecifiedXheight) {
267     // This is because the xheight-region typically includes the shiro-rekha
268     // inside it, i.e., the top of the xheight range corresponds to the top of
269     // shiro-rekha.
270     leeway_to_keep = xheight - stroke_width;
271   }
272   box_to_clear->y = shirorekha_bottom + leeway_to_keep;
273   box_to_clear->h = height - box_to_clear->y;
274   pixClearInRect(word_in_xheight, box_to_clear);
275   boxDestroy(&box_to_clear);
276 
277   PixelHistogram vert_hist;
278   vert_hist.ConstructVerticalCountHist(word_in_xheight);
279   word_in_xheight.destroy();
280 
281   // If the number of black pixel in any column of the image is less than a
282   // fraction of the stroke width, treat it as noise / a stray mark. Perform
283   // these changes inside the vert_hist data itself, as that is used later on as
284   // a bit vector for the final split decision at every column.
285   for (int i = 0; i < width; ++i) {
286     if (vert_hist.hist()[i] <= stroke_width / 4) {
287       vert_hist.hist()[i] = 0;
288     } else {
289       vert_hist.hist()[i] = 1;
290     }
291   }
292   // In order to split the line at any point, we make sure that the width of the
293   // gap is at least half the stroke width.
294   int i = 0;
295   int cur_component_width = 0;
296   while (i < width) {
297     if (!vert_hist.hist()[i]) {
298       int j = 0;
299       while (i + j < width && !vert_hist.hist()[i + j]) {
300         ++j;
301       }
302       if (j >= stroke_width / 2 && cur_component_width >= stroke_width / 2) {
303         // Perform a shiro-rekha split. The intervening region lies from i to
304         // i+j-1.
305         // A minimal single-pixel split makes the estimation of intra- and
306         // inter-word spacing easier during page layout analysis,
307         // whereas a maximal split may be needed for OCR, depending on
308         // how the engine was trained.
309         bool minimal_split = (split_strategy == MINIMAL_SPLIT);
310         int split_width = minimal_split ? 1 : j;
311         int split_left = minimal_split ? i + (j / 2) - (split_width / 2) : i;
312         if (!minimal_split || (i != 0 && i + j != width)) {
313           Box *box_to_clear =
314               boxCreate(word_left + split_left, word_top + shirorekha_top - stroke_width / 3,
315                         split_width, 5 * stroke_width / 3);
316           if (box_to_clear) {
317             boxaAddBox(regions_to_clear, box_to_clear, L_CLONE);
318             // Mark this in the debug image if needed.
319             if (devanagari_split_debugimage) {
320               pixRenderBoxArb(debug_image_, box_to_clear, 1, 128, 255, 128);
321             }
322             boxDestroy(&box_to_clear);
323             cur_component_width = 0;
324           }
325         }
326       }
327       i += j;
328     } else {
329       ++i;
330       ++cur_component_width;
331     }
332   }
333 }
334 
335 // Refreshes the words in the segmentation block list by using blobs in the
336 // input block list.
337 // The segmentation block list must be set.
RefreshSegmentationWithNewBlobs(C_BLOB_LIST * new_blobs)338 void ShiroRekhaSplitter::RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs) {
339   // The segmentation block list must have been specified.
340   ASSERT_HOST(segmentation_block_list_);
341   if (devanagari_split_debuglevel > 0) {
342     tprintf("Before refreshing blobs:\n");
343     PrintSegmentationStats(segmentation_block_list_);
344     tprintf("New Blobs found: %d\n", new_blobs->length());
345   }
346 
347   C_BLOB_LIST not_found_blobs;
348   RefreshWordBlobsFromNewBlobs(
349       segmentation_block_list_, new_blobs,
350       ((devanagari_split_debugimage && debug_image_) ? &not_found_blobs : nullptr));
351 
352   if (devanagari_split_debuglevel > 0) {
353     tprintf("After refreshing blobs:\n");
354     PrintSegmentationStats(segmentation_block_list_);
355   }
356   if (devanagari_split_debugimage && debug_image_) {
357     // Plot out the original blobs for which no match was found in the new
358     // all_blobs list.
359     C_BLOB_IT not_found_it(&not_found_blobs);
360     for (not_found_it.mark_cycle_pt(); !not_found_it.cycled_list(); not_found_it.forward()) {
361       C_BLOB *not_found = not_found_it.data();
362       TBOX not_found_box = not_found->bounding_box();
363       Box *box_to_plot = GetBoxForTBOX(not_found_box);
364       pixRenderBoxArb(debug_image_, box_to_plot, 1, 255, 0, 255);
365       boxDestroy(&box_to_plot);
366     }
367 
368     // Plot out the blobs unused from all blobs.
369     C_BLOB_IT all_blobs_it(new_blobs);
370     for (all_blobs_it.mark_cycle_pt(); !all_blobs_it.cycled_list(); all_blobs_it.forward()) {
371       C_BLOB *a_blob = all_blobs_it.data();
372       Box *box_to_plot = GetBoxForTBOX(a_blob->bounding_box());
373       pixRenderBoxArb(debug_image_, box_to_plot, 3, 0, 127, 0);
374       boxDestroy(&box_to_plot);
375     }
376   }
377 }
378 
379 // Returns a new box object for the corresponding TBOX, based on the original
380 // image's coordinate system.
GetBoxForTBOX(const TBOX & tbox) const381 Box *ShiroRekhaSplitter::GetBoxForTBOX(const TBOX &tbox) const {
382   return boxCreate(tbox.left(), pixGetHeight(orig_pix_) - tbox.top() - 1, tbox.width(),
383                    tbox.height());
384 }
385 
386 // This method returns the computed mode-height of blobs in the pix.
387 // It also prunes very small blobs from calculation.
GetModeHeight(Image pix)388 int ShiroRekhaSplitter::GetModeHeight(Image pix) {
389   Boxa *boxa = pixConnComp(pix, nullptr, 8);
390   STATS heights(0, pixGetHeight(pix));
391   heights.clear();
392   for (int i = 0; i < boxaGetCount(boxa); ++i) {
393     Box *box = boxaGetBox(boxa, i, L_CLONE);
394     if (box->h >= 3 || box->w >= 3) {
395       heights.add(box->h, 1);
396     }
397     boxDestroy(&box);
398   }
399   boxaDestroy(&boxa);
400   return heights.mode();
401 }
402 
403 // This method returns y-extents of the shiro-rekha computed from the input
404 // word image.
GetShiroRekhaYExtents(Image word_pix,int * shirorekha_top,int * shirorekha_bottom,int * shirorekha_ylevel)405 void ShiroRekhaSplitter::GetShiroRekhaYExtents(Image word_pix, int *shirorekha_top,
406                                                int *shirorekha_bottom, int *shirorekha_ylevel) {
407   // Compute a histogram from projecting the word on a vertical line.
408   PixelHistogram hist_horiz;
409   hist_horiz.ConstructHorizontalCountHist(word_pix);
410   // Get the ylevel where the top-line exists. This is basically the global
411   // maxima in the horizontal histogram.
412   int topline_onpixel_count = 0;
413   int topline_ylevel = hist_horiz.GetHistogramMaximum(&topline_onpixel_count);
414 
415   // Get the upper and lower extents of the shiro rekha.
416   int thresh = (topline_onpixel_count * 70) / 100;
417   int ulimit = topline_ylevel;
418   int llimit = topline_ylevel;
419   while (ulimit > 0 && hist_horiz.hist()[ulimit] >= thresh) {
420     --ulimit;
421   }
422   while (llimit < pixGetHeight(word_pix) && hist_horiz.hist()[llimit] >= thresh) {
423     ++llimit;
424   }
425 
426   if (shirorekha_top) {
427     *shirorekha_top = ulimit;
428   }
429   if (shirorekha_bottom) {
430     *shirorekha_bottom = llimit;
431   }
432   if (shirorekha_ylevel) {
433     *shirorekha_ylevel = topline_ylevel;
434   }
435 }
436 
437 // This method returns the global-maxima for the histogram. The frequency of
438 // the global maxima is returned in count, if specified.
GetHistogramMaximum(int * count) const439 int PixelHistogram::GetHistogramMaximum(int *count) const {
440   int best_value = 0;
441   for (int i = 0; i < length_; ++i) {
442     if (hist_[i] > hist_[best_value]) {
443       best_value = i;
444     }
445   }
446   if (count) {
447     *count = hist_[best_value];
448   }
449   return best_value;
450 }
451 
452 // Methods to construct histograms from images.
ConstructVerticalCountHist(Image pix)453 void PixelHistogram::ConstructVerticalCountHist(Image pix) {
454   Clear();
455   int width = pixGetWidth(pix);
456   int height = pixGetHeight(pix);
457   hist_ = new int[width];
458   length_ = width;
459   int wpl = pixGetWpl(pix);
460   l_uint32 *data = pixGetData(pix);
461   for (int i = 0; i < width; ++i) {
462     hist_[i] = 0;
463   }
464   for (int i = 0; i < height; ++i) {
465     l_uint32 *line = data + i * wpl;
466     for (int j = 0; j < width; ++j) {
467       if (GET_DATA_BIT(line, j)) {
468         ++(hist_[j]);
469       }
470     }
471   }
472 }
473 
ConstructHorizontalCountHist(Image pix)474 void PixelHistogram::ConstructHorizontalCountHist(Image pix) {
475   Clear();
476   Numa *counts = pixCountPixelsByRow(pix, nullptr);
477   length_ = numaGetCount(counts);
478   hist_ = new int[length_];
479   for (int i = 0; i < length_; ++i) {
480     l_int32 val = 0;
481     numaGetIValue(counts, i, &val);
482     hist_[i] = val;
483   }
484   numaDestroy(&counts);
485 }
486 
487 } // namespace tesseract.
488