1 /**********************************************************************
2  * File:        stringrenderer.cpp
3  * Description: Class for rendering UTF-8 text to an image, and retrieving
4  *              bounding boxes around each grapheme cluster.
5  * Author:      Ranjith Unnikrishnan
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "stringrenderer.h"
21 
22 #include <allheaders.h> // from leptonica
23 #include "boxchar.h"
24 #include "helpers.h" // for TRand
25 #include "ligature_table.h"
26 #include "normstrngs.h"
27 #include "tlog.h"
28 
29 #include <tesseract/unichar.h>
30 
31 #include "pango/pango-font.h"
32 #include "pango/pango-glyph-item.h"
33 #include "unicode/uchar.h" // from libicu
34 
35 #include <algorithm>
36 #include <cassert>
37 #include <cstdio>
38 #include <cstring>
39 #include <map>
40 #include <utility>
41 #include <vector>
42 
43 #define DISABLE_HEAP_LEAK_CHECK
44 
45 namespace tesseract {
46 
// Resolution passed to set_resolution() by the StringRenderer constructor.
static const int kDefaultOutputResolution = 300;

// Word joiner (U+2060) inserted after letters in ngram mode, as per
// recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at
// hyphens and other non-alpha characters.
// Both the characters and the pointer are const, so this file-level constant
// cannot be re-pointed at another string by accident.
static const char *const kWordJoinerUTF8 = "\u2060";
53 
IsCombiner(int ch)54 static bool IsCombiner(int ch) {
55   const int char_type = u_charType(ch);
56   return ((char_type == U_NON_SPACING_MARK) || (char_type == U_ENCLOSING_MARK) ||
57           (char_type == U_COMBINING_SPACING_MARK));
58 }
59 
EncodeAsUTF8(const char32 ch32)60 static std::string EncodeAsUTF8(const char32 ch32) {
61   UNICHAR uni_ch(ch32);
62   return std::string(uni_ch.utf8(), uni_ch.utf8_len());
63 }
64 
65 // Returns true with probability 'prob'.
RandBool(const double prob,TRand * rand)66 static bool RandBool(const double prob, TRand *rand) {
67   if (prob == 1.0) {
68     return true;
69   }
70   if (prob == 0.0) {
71     return false;
72   }
73   return rand->UnsignedRand(1.0) < prob;
74 }
75 
76 /* static */
CairoARGB32ToPixFormat(cairo_surface_t * surface)77 static Image CairoARGB32ToPixFormat(cairo_surface_t *surface) {
78   if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) {
79     printf("Unexpected surface format %d\n", cairo_image_surface_get_format(surface));
80     return nullptr;
81   }
82   const int width = cairo_image_surface_get_width(surface);
83   const int height = cairo_image_surface_get_height(surface);
84   Image pix = pixCreate(width, height, 32);
85   int byte_stride = cairo_image_surface_get_stride(surface);
86 
87   for (int i = 0; i < height; ++i) {
88     memcpy(reinterpret_cast<unsigned char *>(pix->data + i * pix->wpl) + 1,
89            cairo_image_surface_get_data(surface) + i * byte_stride,
90            byte_stride - ((i == height - 1) ? 1 : 0));
91   }
92   return pix;
93 }
94 
StringRenderer(const std::string & font_desc,int page_width,int page_height)95 StringRenderer::StringRenderer(const std::string &font_desc, int page_width, int page_height)
96     : font_(font_desc)
97     , page_width_(page_width)
98     , page_height_(page_height)
99     , h_margin_(50)
100     , v_margin_(50)
101     , pen_color_{0.0, 0.0, 0.0}
102     , char_spacing_(0)
103     , leading_(0)
104     , vertical_text_(false)
105     , gravity_hint_strong_(false)
106     , render_fullwidth_latin_(false)
107     , underline_start_prob_(0)
108     , underline_continuation_prob_(0)
109     , underline_style_(PANGO_UNDERLINE_SINGLE)
110     , drop_uncovered_chars_(true)
111     , strip_unrenderable_words_(false)
112     , add_ligatures_(false)
113     , output_word_boxes_(false)
114     , surface_(nullptr)
115     , cr_(nullptr)
116     , layout_(nullptr)
117     , start_box_(0)
118     , page_(0)
119     , box_padding_(0)
120     , page_boxes_(nullptr)
121     , total_chars_(0)
122     , font_index_(0)
123     , last_offset_(0) {
124   set_resolution(kDefaultOutputResolution);
125   set_font(font_desc);
126 }
127 
set_font(const std::string & desc)128 bool StringRenderer::set_font(const std::string &desc) {
129   bool success = font_.ParseFontDescriptionName(desc);
130   font_.set_resolution(resolution_);
131   return success;
132 }
133 
set_resolution(const int resolution)134 void StringRenderer::set_resolution(const int resolution) {
135   resolution_ = resolution;
136   font_.set_resolution(resolution);
137 }
138 
set_underline_start_prob(const double frac)139 void StringRenderer::set_underline_start_prob(const double frac) {
140   underline_start_prob_ = std::min(std::max(frac, 0.0), 1.0);
141 }
142 
set_underline_continuation_prob(const double frac)143 void StringRenderer::set_underline_continuation_prob(const double frac) {
144   underline_continuation_prob_ = std::min(std::max(frac, 0.0), 1.0);
145 }
146 
~StringRenderer()147 StringRenderer::~StringRenderer() {
148   ClearBoxes();
149   FreePangoCairo();
150 }
151 
InitPangoCairo()152 void StringRenderer::InitPangoCairo() {
153   FreePangoCairo();
154   surface_ = cairo_image_surface_create(CAIRO_FORMAT_ARGB32, page_width_, page_height_);
155   cr_ = cairo_create(surface_);
156   {
157     DISABLE_HEAP_LEAK_CHECK;
158     layout_ = pango_cairo_create_layout(cr_);
159   }
160 
161   if (vertical_text_) {
162     PangoContext *context = pango_layout_get_context(layout_);
163     pango_context_set_base_gravity(context, PANGO_GRAVITY_EAST);
164     if (gravity_hint_strong_) {
165       pango_context_set_gravity_hint(context, PANGO_GRAVITY_HINT_STRONG);
166     }
167     pango_layout_context_changed(layout_);
168   }
169 
170   SetLayoutProperties();
171 }
172 
SetLayoutProperties()173 void StringRenderer::SetLayoutProperties() {
174   std::string font_desc = font_.DescriptionName();
175   // Specify the font via a description name
176   PangoFontDescription *desc = pango_font_description_from_string(font_desc.c_str());
177   // Assign the font description to the layout
178   pango_layout_set_font_description(layout_, desc);
179   pango_font_description_free(desc); // free the description
180   pango_cairo_context_set_resolution(pango_layout_get_context(layout_), resolution_);
181 
182   int max_width = page_width_ - 2 * h_margin_;
183   int max_height = page_height_ - 2 * v_margin_;
184   tlog(3, "max_width = %d, max_height = %d\n", max_width, max_height);
185   if (vertical_text_) {
186     using std::swap;
187     swap(max_width, max_height);
188   }
189   pango_layout_set_width(layout_, max_width * PANGO_SCALE);
190   // Ultra-wide Thai strings need to wrap at char level.
191   pango_layout_set_wrap(layout_, PANGO_WRAP_WORD_CHAR);
192 
193   // Adjust character spacing
194   PangoAttrList *attr_list = pango_attr_list_new();
195   if (char_spacing_) {
196     PangoAttribute *spacing_attr = pango_attr_letter_spacing_new(char_spacing_ * PANGO_SCALE);
197     spacing_attr->start_index = 0;
198     spacing_attr->end_index = static_cast<guint>(-1);
199     pango_attr_list_change(attr_list, spacing_attr);
200   }
201 
202   if (add_ligatures_) {
203     set_features("liga, clig, dlig, hlig");
204     PangoAttribute *feature_attr = pango_attr_font_features_new(features_.c_str());
205     pango_attr_list_change(attr_list, feature_attr);
206   }
207 
208   pango_layout_set_attributes(layout_, attr_list);
209   pango_attr_list_unref(attr_list);
210   // Adjust line spacing
211   if (leading_) {
212     pango_layout_set_spacing(layout_, leading_ * PANGO_SCALE);
213   }
214 }
215 
FreePangoCairo()216 void StringRenderer::FreePangoCairo() {
217   if (layout_) {
218     g_object_unref(layout_);
219     layout_ = nullptr;
220   }
221   if (cr_) {
222     cairo_destroy(cr_);
223     cr_ = nullptr;
224   }
225   if (surface_) {
226     cairo_surface_destroy(surface_);
227     surface_ = nullptr;
228   }
229 }
230 
SetWordUnderlineAttributes(const std::string & page_text)231 void StringRenderer::SetWordUnderlineAttributes(const std::string &page_text) {
232   if (underline_start_prob_ == 0) {
233     return;
234   }
235   PangoAttrList *attr_list = pango_layout_get_attributes(layout_);
236 
237   const char *text = page_text.c_str();
238   size_t offset = 0;
239   TRand rand;
240   bool started_underline = false;
241   PangoAttribute *und_attr = nullptr;
242 
243   while (offset < page_text.length()) {
244     offset += SpanUTF8Whitespace(text + offset);
245     if (offset == page_text.length()) {
246       break;
247     }
248 
249     int word_start = offset;
250     int word_len = SpanUTF8NotWhitespace(text + offset);
251     offset += word_len;
252     if (started_underline) {
253       // Should we continue the underline to the next word?
254       if (RandBool(underline_continuation_prob_, &rand)) {
255         // Continue the current underline to this word.
256         und_attr->end_index = word_start + word_len;
257       } else {
258         // Otherwise end the current underline attribute at the end of the
259         // previous word.
260         pango_attr_list_insert(attr_list, und_attr);
261         started_underline = false;
262         und_attr = nullptr;
263       }
264     }
265     if (!started_underline && RandBool(underline_start_prob_, &rand)) {
266       // Start a new underline attribute
267       und_attr = pango_attr_underline_new(underline_style_);
268       und_attr->start_index = word_start;
269       und_attr->end_index = word_start + word_len;
270       started_underline = true;
271     }
272   }
273   // Finish the current underline attribute at the end of the page.
274   if (started_underline) {
275     und_attr->end_index = page_text.length();
276     pango_attr_list_insert(attr_list, und_attr);
277   }
278 }
279 
280 // Returns offset in utf8 bytes to first page.
FindFirstPageBreakOffset(const char * text,int text_length)281 int StringRenderer::FindFirstPageBreakOffset(const char *text, int text_length) {
282   if (!text_length) {
283     return 0;
284   }
285   const int max_height = (page_height_ - 2 * v_margin_);
286   const int max_width = (page_width_ - 2 * h_margin_);
287   const int max_layout_height = vertical_text_ ? max_width : max_height;
288 
289   UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
290   const UNICHAR::const_iterator it_end = UNICHAR::end(text, text_length);
291   const int kMaxUnicodeBufLength = 15000;
292   for (int i = 0; i < kMaxUnicodeBufLength && it != it_end; ++it, ++i) {
293     ;
294   }
295   int buf_length = it.utf8_data() - text;
296   tlog(1, "len = %d  buf_len = %d\n", text_length, buf_length);
297   pango_layout_set_text(layout_, text, buf_length);
298 
299   PangoLayoutIter *line_iter = nullptr;
300   { // Fontconfig caches some info here that is not freed before exit.
301     DISABLE_HEAP_LEAK_CHECK;
302     line_iter = pango_layout_get_iter(layout_);
303   }
304   bool first_page = true;
305   int page_top = 0;
306   int offset = buf_length;
307   do {
308     // Get bounding box of the current line
309     PangoRectangle line_ink_rect;
310     pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr);
311     pango_extents_to_pixels(&line_ink_rect, nullptr);
312     PangoLayoutLine *line = pango_layout_iter_get_line_readonly(line_iter);
313     if (first_page) {
314       page_top = line_ink_rect.y;
315       first_page = false;
316     }
317     int line_bottom = line_ink_rect.y + line_ink_rect.height;
318     if (line_bottom - page_top > max_layout_height) {
319       offset = line->start_index;
320       tlog(1, "Found offset = %d\n", offset);
321       break;
322     }
323   } while (pango_layout_iter_next_line(line_iter));
324   pango_layout_iter_free(line_iter);
325   return offset;
326 }
327 
GetBoxes() const328 const std::vector<BoxChar *> &StringRenderer::GetBoxes() const {
329   return boxchars_;
330 }
331 
GetPageBoxes() const332 Boxa *StringRenderer::GetPageBoxes() const {
333   return page_boxes_;
334 }
335 
RotatePageBoxes(float rotation)336 void StringRenderer::RotatePageBoxes(float rotation) {
337   BoxChar::RotateBoxes(rotation, page_width_ / 2, page_height_ / 2, start_box_, boxchars_.size(),
338                        &boxchars_);
339 }
340 
ClearBoxes()341 void StringRenderer::ClearBoxes() {
342   for (auto &boxchar : boxchars_) {
343     delete boxchar;
344   }
345   boxchars_.clear();
346   boxaDestroy(&page_boxes_);
347 }
348 
GetBoxesStr()349 std::string StringRenderer::GetBoxesStr() {
350   BoxChar::PrepareToWrite(&boxchars_);
351   return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
352 }
353 
WriteAllBoxes(const std::string & filename)354 void StringRenderer::WriteAllBoxes(const std::string &filename) {
355   BoxChar::PrepareToWrite(&boxchars_);
356   BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
357 }
358 
359 // Returns cluster strings in logical order.
GetClusterStrings(std::vector<std::string> * cluster_text)360 bool StringRenderer::GetClusterStrings(std::vector<std::string> *cluster_text) {
361   std::map<int, std::string> start_byte_to_text;
362   PangoLayoutIter *run_iter = pango_layout_get_iter(layout_);
363   const char *full_text = pango_layout_get_text(layout_);
364   do {
365     PangoLayoutRun *run = pango_layout_iter_get_run_readonly(run_iter);
366     if (!run) {
367       // End of line nullptr run marker
368       tlog(2, "Found end of line marker\n");
369       continue;
370     }
371     PangoGlyphItemIter cluster_iter;
372     gboolean have_cluster;
373     for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter, run, full_text);
374          have_cluster; have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
375       const int start_byte_index = cluster_iter.start_index;
376       const int end_byte_index = cluster_iter.end_index;
377       std::string text =
378           std::string(full_text + start_byte_index, end_byte_index - start_byte_index);
379       if (IsUTF8Whitespace(text.c_str())) {
380         tlog(2, "Found whitespace\n");
381         text = " ";
382       }
383       tlog(2, "start_byte=%d end_byte=%d : '%s'\n", start_byte_index, end_byte_index, text.c_str());
384       if (add_ligatures_) {
385         // Make sure the output box files have ligatured text in case the font
386         // decided to use an unmapped glyph.
387         text = LigatureTable::Get()->AddLigatures(text, nullptr);
388       }
389       start_byte_to_text[start_byte_index] = text;
390     }
391   } while (pango_layout_iter_next_run(run_iter));
392   pango_layout_iter_free(run_iter);
393 
394   cluster_text->clear();
395   for (auto it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) {
396     cluster_text->push_back(it->second);
397   }
398   return !cluster_text->empty();
399 }
400 
401 // Merges an array of BoxChars into words based on the identification of
402 // BoxChars containing the space character as inter-word separators.
403 //
404 // Sometime two adjacent characters in the sequence may be detected as lying on
405 // different lines based on their spatial positions. This may be the result of a
406 // newline character at end of the last word on a line in the source text, or of
407 // a discretionary line-break created by Pango at intra-word locations like
408 // hyphens. When this is detected the word is split at that location into
409 // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and
410 // its bounding box.
MergeBoxCharsToWords(std::vector<BoxChar * > * boxchars)411 static void MergeBoxCharsToWords(std::vector<BoxChar *> *boxchars) {
412   std::vector<BoxChar *> result;
413   bool started_word = false;
414   for (auto &boxchar : *boxchars) {
415     if (boxchar->ch() == " " || boxchar->box() == nullptr) {
416       result.push_back(boxchar);
417       boxchar = nullptr;
418       started_word = false;
419       continue;
420     }
421 
422     if (!started_word) {
423       // Begin new word
424       started_word = true;
425       result.push_back(boxchar);
426       boxchar = nullptr;
427     } else {
428       BoxChar *last_boxchar = result.back();
429       // Compute bounding box union
430       const Box *box = boxchar->box();
431       Box *last_box = last_boxchar->mutable_box();
432       int left = std::min(last_box->x, box->x);
433       int right = std::max(last_box->x + last_box->w, box->x + box->w);
434       int top = std::min(last_box->y, box->y);
435       int bottom = std::max(last_box->y + last_box->h, box->y + box->h);
436       // Conclude that the word was broken to span multiple lines based on the
437       // size of the merged bounding box in relation to those of the individual
438       // characters seen so far.
439       if (right - left > last_box->w + 5 * box->w) {
440         tlog(1, "Found line break after '%s'", last_boxchar->ch().c_str());
441         // Insert a fake interword space and start a new word with the current
442         // boxchar.
443         result.push_back(new BoxChar(" ", 1));
444         result.push_back(boxchar);
445         boxchar = nullptr;
446         continue;
447       }
448       // Append to last word
449       last_boxchar->mutable_ch()->append(boxchar->ch());
450       last_box->x = left;
451       last_box->w = right - left;
452       last_box->y = top;
453       last_box->h = bottom - top;
454       delete boxchar;
455       boxchar = nullptr;
456     }
457   }
458   boxchars->swap(result);
459 }
460 
ComputeClusterBoxes()461 void StringRenderer::ComputeClusterBoxes() {
462   const char *text = pango_layout_get_text(layout_);
463   PangoLayoutIter *cluster_iter = pango_layout_get_iter(layout_);
464 
465   // Do a first pass to store cluster start indexes.
466   std::vector<int> cluster_start_indices;
467   do {
468     cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter));
469     tlog(3, "Added %d\n", cluster_start_indices.back());
470   } while (pango_layout_iter_next_cluster(cluster_iter));
471   pango_layout_iter_free(cluster_iter);
472   cluster_start_indices.push_back(strlen(text));
473   tlog(3, "Added last index %d\n", cluster_start_indices.back());
474   // Sort the indices and create a map from start to end indices.
475   std::sort(cluster_start_indices.begin(), cluster_start_indices.end());
476   std::map<int, int> cluster_start_to_end_index;
477   for (size_t i = 0; i + 1 < cluster_start_indices.size(); ++i) {
478     cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1];
479   }
480 
481   // Iterate again to compute cluster boxes and their text with the obtained
482   // cluster extent information.
483   cluster_iter = pango_layout_get_iter(layout_);
484   // Store BoxChars* sorted by their byte start positions
485   std::map<int, BoxChar *> start_byte_to_box;
486   do {
487     PangoRectangle cluster_rect;
488     pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr);
489     pango_extents_to_pixels(&cluster_rect, nullptr);
490     const int start_byte_index = pango_layout_iter_get_index(cluster_iter);
491     const int end_byte_index = cluster_start_to_end_index[start_byte_index];
492     std::string cluster_text =
493         std::string(text + start_byte_index, end_byte_index - start_byte_index);
494     if (!cluster_text.empty() && cluster_text[0] == '\n') {
495       tlog(2, "Skipping newlines at start of text.\n");
496       continue;
497     }
498     if (!cluster_rect.width || !cluster_rect.height || IsUTF8Whitespace(cluster_text.c_str())) {
499       tlog(2, "Skipping whitespace with boxdim (%d,%d) '%s'\n", cluster_rect.width,
500            cluster_rect.height, cluster_text.c_str());
501       auto *boxchar = new BoxChar(" ", 1);
502       boxchar->set_page(page_);
503       start_byte_to_box[start_byte_index] = boxchar;
504       continue;
505     }
506     // Prepare a boxchar for addition at this byte position.
507     tlog(2, "[%d %d], %d, %d : start_byte=%d end_byte=%d : '%s'\n", cluster_rect.x, cluster_rect.y,
508          cluster_rect.width, cluster_rect.height, start_byte_index, end_byte_index,
509          cluster_text.c_str());
510     ASSERT_HOST_MSG(cluster_rect.width, "cluster_text:%s  start_byte_index:%d\n",
511                     cluster_text.c_str(), start_byte_index);
512     ASSERT_HOST_MSG(cluster_rect.height, "cluster_text:%s  start_byte_index:%d\n",
513                     cluster_text.c_str(), start_byte_index);
514     if (box_padding_) {
515       cluster_rect.x = std::max(0, cluster_rect.x - box_padding_);
516       cluster_rect.width += 2 * box_padding_;
517       cluster_rect.y = std::max(0, cluster_rect.y - box_padding_);
518       cluster_rect.height += 2 * box_padding_;
519     }
520     if (add_ligatures_) {
521       // Make sure the output box files have ligatured text in case the font
522       // decided to use an unmapped glyph.
523       cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr);
524     }
525     auto *boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size());
526     boxchar->set_page(page_);
527     boxchar->AddBox(cluster_rect.x, cluster_rect.y, cluster_rect.width, cluster_rect.height);
528     start_byte_to_box[start_byte_index] = boxchar;
529   } while (pango_layout_iter_next_cluster(cluster_iter));
530   pango_layout_iter_free(cluster_iter);
531 
532   // There is a subtle bug in the cluster text reported by the PangoLayoutIter
533   // on ligatured characters (eg. The word "Lam-Aliph" in arabic). To work
534   // around this, we use text reported using the PangoGlyphIter which is
535   // accurate.
536   // TODO(ranjith): Revisit whether this is still needed in newer versions of
537   // pango.
538   std::vector<std::string> cluster_text;
539   if (GetClusterStrings(&cluster_text)) {
540     ASSERT_HOST(cluster_text.size() == start_byte_to_box.size());
541     int ind = 0;
542     for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) {
543       it->second->mutable_ch()->swap(cluster_text[ind]);
544     }
545   }
546 
547   // Append to the boxchars list in byte order.
548   std::vector<BoxChar *> page_boxchars;
549   page_boxchars.reserve(start_byte_to_box.size());
550   std::string last_ch;
551   for (auto it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) {
552     if (it->second->ch() == kWordJoinerUTF8) {
553       // Skip zero-width joiner characters (ZWJs) here.
554       delete it->second;
555     } else {
556       page_boxchars.push_back(it->second);
557     }
558   }
559   CorrectBoxPositionsToLayout(&page_boxchars);
560 
561   if (render_fullwidth_latin_) {
562     for (auto &it : start_byte_to_box) {
563       // Convert fullwidth Latin characters to their halfwidth forms.
564       std::string half(ConvertFullwidthLatinToBasicLatin(it.second->ch()));
565       it.second->mutable_ch()->swap(half);
566     }
567   }
568 
569   // Merge the character boxes into word boxes if we are rendering n-grams.
570   if (output_word_boxes_) {
571     MergeBoxCharsToWords(&page_boxchars);
572   }
573 
574   boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end());
575 
576   // Compute the page bounding box
577   Box *page_box = nullptr;
578   Boxa *all_boxes = nullptr;
579   for (auto &page_boxchar : page_boxchars) {
580     if (page_boxchar->box() == nullptr) {
581       continue;
582     }
583     if (all_boxes == nullptr) {
584       all_boxes = boxaCreate(0);
585     }
586     boxaAddBox(all_boxes, page_boxchar->mutable_box(), L_CLONE);
587   }
588   if (all_boxes != nullptr) {
589     boxaGetExtent(all_boxes, nullptr, nullptr, &page_box);
590     boxaDestroy(&all_boxes);
591     if (page_boxes_ == nullptr) {
592       page_boxes_ = boxaCreate(0);
593     }
594     boxaAddBox(page_boxes_, page_box, L_INSERT);
595   }
596 }
597 
CorrectBoxPositionsToLayout(std::vector<BoxChar * > * boxchars)598 void StringRenderer::CorrectBoxPositionsToLayout(std::vector<BoxChar *> *boxchars) {
599   if (vertical_text_) {
600     const double rotation = -pango_gravity_to_rotation(
601         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
602     BoxChar::TranslateBoxes(page_width_ - h_margin_, v_margin_, boxchars);
603     BoxChar::RotateBoxes(rotation, page_width_ - h_margin_, v_margin_, 0, boxchars->size(),
604                          boxchars);
605   } else {
606     BoxChar::TranslateBoxes(h_margin_, v_margin_, boxchars);
607   }
608 }
609 
StripUnrenderableWords(std::string * utf8_text) const610 int StringRenderer::StripUnrenderableWords(std::string *utf8_text) const {
611   std::string output_text;
612   const char *text = utf8_text->c_str();
613   size_t offset = 0;
614   int num_dropped = 0;
615   while (offset < utf8_text->length()) {
616     int space_len = SpanUTF8Whitespace(text + offset);
617     output_text.append(text + offset, space_len);
618     offset += space_len;
619     if (offset == utf8_text->length()) {
620       break;
621     }
622 
623     int word_len = SpanUTF8NotWhitespace(text + offset);
624     if (font_.CanRenderString(text + offset, word_len)) {
625       output_text.append(text + offset, word_len);
626     } else {
627       ++num_dropped;
628     }
629     offset += word_len;
630   }
631   utf8_text->swap(output_text);
632 
633   if (num_dropped > 0) {
634     tprintf("Stripped %d unrenderable words\n", num_dropped);
635   }
636   return num_dropped;
637 }
638 
RenderToGrayscaleImage(const char * text,int text_length,Image * pix)639 int StringRenderer::RenderToGrayscaleImage(const char *text, int text_length, Image *pix) {
640   Image orig_pix = nullptr;
641   int offset = RenderToImage(text, text_length, &orig_pix);
642   if (orig_pix) {
643     *pix = pixConvertTo8(orig_pix, false);
644     orig_pix.destroy();
645   }
646   return offset;
647 }
648 
RenderToBinaryImage(const char * text,int text_length,int threshold,Image * pix)649 int StringRenderer::RenderToBinaryImage(const char *text, int text_length, int threshold,
650                                         Image *pix) {
651   Image orig_pix = nullptr;
652   int offset = RenderToImage(text, text_length, &orig_pix);
653   if (orig_pix) {
654     Image gray_pix = pixConvertTo8(orig_pix, false);
655     orig_pix.destroy();
656     *pix = pixThresholdToBinary(gray_pix, threshold);
657     gray_pix.destroy();
658   } else {
659     *pix = orig_pix;
660   }
661   return offset;
662 }
663 
664 // Add word joiner (WJ) characters between adjacent non-space characters except
665 // immediately before a combiner.
666 /* static */
InsertWordJoiners(const std::string & text)667 std::string StringRenderer::InsertWordJoiners(const std::string &text) {
668   std::string out_str;
669   const UNICHAR::const_iterator it_end = UNICHAR::end(text.c_str(), text.length());
670   for (UNICHAR::const_iterator it = UNICHAR::begin(text.c_str(), text.length()); it != it_end;
671        ++it) {
672     // Add the symbol to the output string.
673     out_str.append(it.utf8_data(), it.utf8_len());
674     // Check the next symbol.
675     UNICHAR::const_iterator next_it = it;
676     ++next_it;
677     bool next_char_is_boundary = (next_it == it_end || *next_it == ' ');
678     bool next_char_is_combiner = (next_it == it_end) ? false : IsCombiner(*next_it);
679     if (*it != ' ' && *it != '\n' && !next_char_is_boundary && !next_char_is_combiner) {
680       out_str += kWordJoinerUTF8;
681     }
682   }
683   return out_str;
684 }
685 
686 // Convert halfwidth Basic Latin characters to their fullwidth forms.
ConvertBasicLatinToFullwidthLatin(const std::string & str)687 std::string StringRenderer::ConvertBasicLatinToFullwidthLatin(const std::string &str) {
688   std::string full_str;
689   const UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
690   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {
691     // Convert printable and non-space 7-bit ASCII characters to
692     // their fullwidth forms.
693     if (IsInterchangeValid7BitAscii(*it) && isprint(*it) && !isspace(*it)) {
694       // Convert by adding 0xFEE0 to the codepoint of 7-bit ASCII.
695       char32 full_char = *it + 0xFEE0;
696       full_str.append(EncodeAsUTF8(full_char));
697     } else {
698       full_str.append(it.utf8_data(), it.utf8_len());
699     }
700   }
701   return full_str;
702 }
703 
704 // Convert fullwidth Latin characters to their halfwidth forms.
ConvertFullwidthLatinToBasicLatin(const std::string & str)705 std::string StringRenderer::ConvertFullwidthLatinToBasicLatin(const std::string &str) {
706   std::string half_str;
707   UNICHAR::const_iterator it_end = UNICHAR::end(str.c_str(), str.length());
708   for (UNICHAR::const_iterator it = UNICHAR::begin(str.c_str(), str.length()); it != it_end; ++it) {
709     char32 half_char = FullwidthToHalfwidth(*it);
710     // Convert fullwidth Latin characters to their halfwidth forms
711     // only if halfwidth forms are printable and non-space 7-bit ASCII.
712     if (IsInterchangeValid7BitAscii(half_char) && isprint(half_char) && !isspace(half_char)) {
713       half_str.append(EncodeAsUTF8(half_char));
714     } else {
715       half_str.append(it.utf8_data(), it.utf8_len());
716     }
717   }
718   return half_str;
719 }
720 
721 // Returns offset to end of text substring rendered in this method.
RenderToImage(const char * text,int text_length,Image * pix)722 int StringRenderer::RenderToImage(const char *text, int text_length, Image *pix) {
723   if (pix && *pix) {
724     pix->destroy();
725   }
726   InitPangoCairo();
727 
728   const int page_offset = FindFirstPageBreakOffset(text, text_length);
729   if (!page_offset) {
730     return 0;
731   }
732   start_box_ = boxchars_.size();
733 
734   if (!vertical_text_) {
735     // Translate by the specified margin
736     cairo_translate(cr_, h_margin_, v_margin_);
737   } else {
738     // Vertical text rendering is achieved by a two-step process of first
739     // performing regular horizontal layout with character orientation set to
740     // EAST, and then translating and rotating the layout before rendering onto
741     // the desired image surface. The settings required for the former step are
742     // done within InitPangoCairo().
743     //
744     // Translate to the top-right margin of page
745     cairo_translate(cr_, page_width_ - h_margin_, v_margin_);
746     // Rotate the layout
747     double rotation = -pango_gravity_to_rotation(
748         pango_context_get_base_gravity(pango_layout_get_context(layout_)));
749     tlog(2, "Rotating by %f radians\n", rotation);
750     cairo_rotate(cr_, rotation);
751     pango_cairo_update_layout(cr_, layout_);
752   }
753   std::string page_text(text, page_offset);
754   if (render_fullwidth_latin_) {
755     // Convert Basic Latin to their fullwidth forms.
756     page_text = ConvertBasicLatinToFullwidthLatin(page_text);
757   }
758   if (strip_unrenderable_words_) {
759     StripUnrenderableWords(&page_text);
760   }
761   if (drop_uncovered_chars_ && !font_.CoversUTF8Text(page_text.c_str(), page_text.length())) {
762     int num_dropped = font_.DropUncoveredChars(&page_text);
763     if (num_dropped) {
764       tprintf("WARNING: Dropped %d uncovered characters\n", num_dropped);
765     }
766   }
767   if (add_ligatures_) {
768     // Add ligatures wherever possible, including custom ligatures.
769     page_text = LigatureTable::Get()->AddLigatures(page_text, &font_);
770   }
771   if (underline_start_prob_ > 0) {
772     SetWordUnderlineAttributes(page_text);
773   }
774 
775   pango_layout_set_text(layout_, page_text.c_str(), page_text.length());
776 
777   if (pix) {
778     // Set a white background for the target image surface.
779     cairo_set_source_rgb(cr_, 1.0, 1.0, 1.0); // sets drawing colour to white
780     // Fill the surface with the active colour (if you don't do this, you will
781     // be given a surface with a transparent background to draw on)
782     cairo_paint(cr_);
783     // Set the ink color to black
784     cairo_set_source_rgb(cr_, pen_color_[0], pen_color_[1], pen_color_[2]);
785     // If the target surface or transformation properties of the cairo instance
786     // have changed, update the pango layout to reflect this
787     pango_cairo_update_layout(cr_, layout_);
788     {
789       DISABLE_HEAP_LEAK_CHECK; // for Fontconfig
790       // Draw the pango layout onto the cairo surface
791       pango_cairo_show_layout(cr_, layout_);
792     }
793     *pix = CairoARGB32ToPixFormat(surface_);
794   }
795   ComputeClusterBoxes();
796   FreePangoCairo();
797   // Update internal state variables.
798   ++page_;
799   return page_offset;
800 }
801 
802 // Render a string to an image, returning it as an 8 bit pix.  Behaves as
803 // RenderString, except that it ignores the font set at construction and works
804 // through all the fonts, returning 0 until they are exhausted, at which point
805 // it returns the value it should have returned all along, but no pix this time.
806 // Fonts that don't contain a given proportion of the characters in the string
807 // get skipped.
808 // Fonts that work each get rendered and the font name gets added
809 // to the image.
810 // NOTE that no boxes are produced by this function.
811 //
812 // Example usage: To render a null terminated char-array "txt"
813 //
814 // int offset = 0;
815 // do {
816 //   Image pix;
817 //   offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset,
818 //                                            strlen(txt + offset), nullptr,
819 //                                            &pix);
820 //   ...
821 // } while (offset < strlen(text));
822 //
RenderAllFontsToImage(double min_coverage,const char * text,int text_length,std::string * font_used,Image * image)823 int StringRenderer::RenderAllFontsToImage(double min_coverage, const char *text, int text_length,
824                                           std::string *font_used, Image *image) {
825   *image = nullptr;
826   // Select a suitable font to render the title with.
827   const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%";
828   std::string title_font;
829   if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), &title_font, nullptr)) {
830     tprintf("WARNING: Could not find a font to render image title with!\n");
831     title_font = "Arial";
832   }
833   title_font += " 8";
834   tlog(1, "Selected title font: %s\n", title_font.c_str());
835   if (font_used) {
836     font_used->clear();
837   }
838 
839   std::string orig_font = font_.DescriptionName();
840   if (char_map_.empty()) {
841     total_chars_ = 0;
842     // Fill the hash table and use that for computing which fonts to use.
843     for (UNICHAR::const_iterator it = UNICHAR::begin(text, text_length);
844          it != UNICHAR::end(text, text_length); ++it) {
845       ++total_chars_;
846       ++char_map_[*it];
847     }
848     tprintf("Total chars = %d\n", total_chars_);
849   }
850   const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();
851 
852   for (size_t i = font_index_; i < all_fonts.size(); ++i) {
853     ++font_index_;
854     int raw_score = 0;
855     int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr);
856     if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) {
857       set_font(all_fonts[i]);
858       int offset = RenderToBinaryImage(text, text_length, 128, image);
859       ClearBoxes(); // Get rid of them as they are garbage.
860       const int kMaxTitleLength = 1024;
861       char title[kMaxTitleLength];
862       snprintf(title, kMaxTitleLength, kTitleTemplate, all_fonts[i].c_str(), ok_chars,
863                100.0 * ok_chars / total_chars_, raw_score, 100.0 * raw_score / char_map_.size());
864       tprintf("%s\n", title);
865       // This is a good font! Store the offset to return once we've tried all
866       // the fonts.
867       if (offset) {
868         last_offset_ = offset;
869         if (font_used) {
870           *font_used = all_fonts[i];
871         }
872       }
873       // Add the font to the image.
874       set_font(title_font);
875       v_margin_ /= 8;
876       Image title_image = nullptr;
877       RenderToBinaryImage(title, strlen(title), 128, &title_image);
878       *image |= title_image;
879       title_image.destroy();
880 
881       v_margin_ *= 8;
882       set_font(orig_font);
883       // We return the real offset only after cycling through the list of fonts.
884       return 0;
885     } else {
886       tprintf("Font %s failed with %d hits = %.2f%%\n", all_fonts[i].c_str(), ok_chars,
887               100.0 * ok_chars / total_chars_);
888     }
889   }
890   font_index_ = 0;
891   char_map_.clear();
892   return last_offset_ == 0 ? -1 : last_offset_;
893 }
894 
895 } // namespace tesseract
896