1 /**********************************************************************
2  * File:        text2image.cpp
3  * Description: Program to generate OCR training pages. Given a text file it
4  *              outputs an image with a given font and degradation.
5  *
6  *              Note that since the results depend on the fonts available on
7  *              your system, running the code on a different machine, or
8  *              different OS, or even at a different time on the same machine,
9  *              may produce different fonts even if --font is given explicitly.
10  *              To see names of available fonts, use --list_available_fonts with
11  *              the appropriate --fonts_dir path.
12  *              Specifying --use_only_legacy_fonts will restrict the available
13  *              fonts to those listed in legacy_fonts.h
14  * Authors:     Ranjith Unnikrishnan, Ray Smith
15  *
16  * (C) Copyright 2013, Google Inc.
17  * Licensed under the Apache License, Version 2.0 (the "License");
18  * you may not use this file except in compliance with the License.
19  * You may obtain a copy of the License at
20  * http://www.apache.org/licenses/LICENSE-2.0
21  * Unless required by applicable law or agreed to in writing, software
22  * distributed under the License is distributed on an "AS IS" BASIS,
23  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24  * See the License for the specific language governing permissions and
25  * limitations under the License.
26  *
27  **********************************************************************/
28 
29 #include "boxchar.h"
30 #include "commandlineflags.h"
31 #include "commontraining.h" // CheckSharedLibraryVersion
32 #include "degradeimage.h"
33 #include "errcode.h"
34 #include "fileio.h"
35 #include "helpers.h"
36 #include "normstrngs.h"
37 #include "stringrenderer.h"
38 #include "tlog.h"
39 #include "unicharset.h"
40 
41 #include <allheaders.h> // from leptonica
42 
43 #include <algorithm>
44 #include <cstdlib>
45 #include <cstring>
46 #include <iostream>
47 #include <map>
48 #include <random>
49 #include <string>
50 #include <utility>
51 #include <vector>
52 
53 #ifdef _MSC_VER
54 #  define putenv(s) _putenv(s)
55 #endif
56 
57 using namespace tesseract;
58 
// Fixed seed handed to the random number generators used by this program.
constexpr int kRandomSeed{0x18273645};
61 
62 // The text input file.
63 static STRING_PARAM_FLAG(text, "", "File name of text input to process");
64 
65 // The text output file.
66 static STRING_PARAM_FLAG(outputbase, "", "Basename for output image/box file");
67 
68 // Degrade the rendered image to mimic scanner quality.
69 static BOOL_PARAM_FLAG(degrade_image, true,
70                        "Degrade rendered image with speckle noise, dilation/erosion "
71                        "and rotation");
72 
73 // Rotate the rendered image to have more realistic glyph borders
74 static BOOL_PARAM_FLAG(rotate_image, true, "Rotate the image in a random way.");
75 
76 // Degradation to apply to the image.
77 static INT_PARAM_FLAG(exposure, 0, "Exposure level in photocopier");
78 
79 // Distort the rendered image by various means according to the bool flags.
80 static BOOL_PARAM_FLAG(distort_image, false, "Degrade rendered image with noise, blur, invert.");
81 
82 // Distortion to apply to the image.
83 static BOOL_PARAM_FLAG(invert, true, "Invert the image");
84 
85 // Distortion to apply to the image.
86 static BOOL_PARAM_FLAG(white_noise, true, "Add  Gaussian Noise");
87 
88 // Distortion to apply to the image.
89 static BOOL_PARAM_FLAG(smooth_noise, true, "Smoothen Noise");
90 
91 // Distortion to apply to the image.
92 static BOOL_PARAM_FLAG(blur, true, "Blur the image");
93 
94 #if 0
95 
96 // Distortion to apply to the image.
97 static BOOL_PARAM_FLAG(perspective, false, "Generate Perspective Distortion");
98 
99 // Distortion to apply to the image.
100 static INT_PARAM_FLAG(box_reduction, 0, "Integer reduction factor box_scale");
101 
102 #endif
103 
104 // Output image resolution.
105 static INT_PARAM_FLAG(resolution, 300, "Pixels per inch");
106 
107 // Width of output image (in pixels).
108 static INT_PARAM_FLAG(xsize, 3600, "Width of output image");
109 
110 // Max height of output image (in pixels).
111 static INT_PARAM_FLAG(ysize, 4800, "Height of output image");
112 
113 // Max number of pages to produce.
114 static INT_PARAM_FLAG(max_pages, 0, "Maximum number of pages to output (0=unlimited)");
115 
116 // Margin around text (in pixels).
117 static INT_PARAM_FLAG(margin, 100, "Margin round edges of image");
118 
119 // Size of text (in points).
120 static INT_PARAM_FLAG(ptsize, 12, "Size of printed text");
121 
122 // Inter-character space (in ems).
123 static DOUBLE_PARAM_FLAG(char_spacing, 0, "Inter-character space in ems");
124 
125 // Sets the probability (value in [0, 1]) of starting to render a word with an
126 // underline. Words are assumed to be space-delimited.
127 static DOUBLE_PARAM_FLAG(underline_start_prob, 0,
128                          "Fraction of words to underline (value in [0,1])");
129 // Set the probability (value in [0, 1]) of continuing a started underline to
130 // the next word.
131 static DOUBLE_PARAM_FLAG(underline_continuation_prob, 0,
132                          "Fraction of words to underline (value in [0,1])");
133 
134 // Inter-line space (in pixels).
135 static INT_PARAM_FLAG(leading, 12, "Inter-line space (in pixels)");
136 
137 // Layout and glyph orientation on rendering.
138 static STRING_PARAM_FLAG(writing_mode, "horizontal",
139                          "Specify one of the following writing"
140                          " modes.\n"
141                          "'horizontal' : Render regular horizontal text. (default)\n"
142                          "'vertical' : Render vertical text. Glyph orientation is"
143                          " selected by Pango.\n"
144                          "'vertical-upright' : Render vertical text. Glyph "
145                          " orientation is set to be upright.");
146 
147 static INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes");
148 
149 static BOOL_PARAM_FLAG(strip_unrenderable_words, true,
150                        "Remove unrenderable words from source text");
151 
152 // Font name.
153 static STRING_PARAM_FLAG(font, "Arial", "Font description name to use");
154 
155 static BOOL_PARAM_FLAG(ligatures, false, "Rebuild and render ligatures");
156 
157 static BOOL_PARAM_FLAG(find_fonts, false, "Search for all fonts that can render the text");
158 static BOOL_PARAM_FLAG(render_per_font, true,
159                        "If find_fonts==true, render each font to its own image. "
160                        "Image filenames are of the form output_name.font_name.tif");
161 static DOUBLE_PARAM_FLAG(min_coverage, 1.0,
162                          "If find_fonts==true, the minimum coverage the font has of "
163                          "the characters in the text file to include it, between "
164                          "0 and 1.");
165 
166 static BOOL_PARAM_FLAG(list_available_fonts, false, "List available fonts and quit.");
167 
168 static BOOL_PARAM_FLAG(render_ngrams, false,
169                        "Put each space-separated entity from the"
170                        " input file into one bounding box. The ngrams in the input"
171                        " file will be randomly permuted before rendering (so that"
172                        " there is sufficient variety of characters on each line).");
173 
174 static BOOL_PARAM_FLAG(output_word_boxes, false,
175                        "Output word bounding boxes instead of character boxes. "
176                        "This is used for Cube training, and implied by "
177                        "--render_ngrams.");
178 
179 static STRING_PARAM_FLAG(unicharset_file, "",
180                          "File with characters in the unicharset. If --render_ngrams"
181                          " is true and --unicharset_file is specified, ngrams with"
182                          " characters that are not in unicharset will be omitted");
183 
184 static BOOL_PARAM_FLAG(bidirectional_rotation, false, "Rotate the generated characters both ways.");
185 
186 static BOOL_PARAM_FLAG(only_extract_font_properties, false,
187                        "Assumes that the input file contains a list of ngrams. Renders"
188                        " each ngram, extracts spacing properties and records them in"
189                        " output_base/[font_name].fontinfo file.");
190 
191 // Use these flags to output zero-padded, square individual character images
192 static BOOL_PARAM_FLAG(output_individual_glyph_images, false,
193                        "If true also outputs individual character images");
194 static INT_PARAM_FLAG(glyph_resized_size, 0,
195                       "Each glyph is square with this side length in pixels");
196 static INT_PARAM_FLAG(glyph_num_border_pixels_to_pad, 0,
197                       "Final_size=glyph_resized_size+2*glyph_num_border_pixels_to_pad");
198 
199 namespace tesseract {
200 
// Horizontal spacing measurements recorded for a single unichar.
struct SpacingProperties {
  SpacingProperties() = default;
  SpacingProperties(int b, int a) : x_gap_before{b}, x_gap_after{a} {}

  // Both gaps are derived from the FT_Glyph_Metrics struct used by the
  // FreeType font engine.
  int x_gap_before = 0; // horizontal x bearing
  int x_gap_after = 0;  // horizontal advance - x_gap_before - width
  // Gap to a specific following unichar, keyed by that unichar.
  std::map<std::string, int> kerned_x_gaps;
};
210 
IsWhitespaceBox(const BoxChar * boxchar)211 static bool IsWhitespaceBox(const BoxChar *boxchar) {
212   return (boxchar->box() == nullptr || SpanUTF8Whitespace(boxchar->ch().c_str()));
213 }
214 
// Returns a copy of |in| in which every non-overlapping occurrence of
// |oldsub|, found scanning left to right, is replaced by |newsub|.
// An empty |oldsub| matches nothing and |in| is returned unchanged (searching
// for an empty string would otherwise match at the same position forever).
static std::string StringReplace(const std::string &in, const std::string &oldsub,
                                 const std::string &newsub) {
  if (oldsub.empty()) {
    return in;
  }
  std::string out;
  size_t start_pos = 0;
  size_t pos;
  while ((pos = in.find(oldsub, start_pos)) != std::string::npos) {
    // Copy the untouched text before the match, then the replacement.
    out.append(in, start_pos, pos - start_pos);
    out.append(newsub);
    start_pos = pos + oldsub.length();
  }
  // Copy whatever follows the last match.
  out.append(in, start_pos, std::string::npos);
  return out;
}
227 
228 // Assumes that each word (whitespace-separated entity) in text is a bigram.
229 // Renders the bigrams and calls FontInfo::GetSpacingProperties() to
230 // obtain spacing information. Produces the output .fontinfo file with a line
231 // per unichar of the form:
232 // unichar space_before space_after kerned1 kerned_space1 kerned2 ...
233 // Fox example, if unichar "A" has spacing of 0 pixels before and -1 pixels
234 // after, is kerned with "V" resulting in spacing of "AV" to be -7 and kerned
235 // with "T", such that "AT" has spacing of -5, the entry/line for unichar "A"
236 // in .fontinfo file will be:
237 // A 0 -1 T -5 V -7
ExtractFontProperties(const std::string & utf8_text,StringRenderer * render,const std::string & output_base)238 static void ExtractFontProperties(const std::string &utf8_text, StringRenderer *render,
239                                   const std::string &output_base) {
240   std::map<std::string, SpacingProperties> spacing_map;
241   std::map<std::string, SpacingProperties>::iterator spacing_map_it0;
242   std::map<std::string, SpacingProperties>::iterator spacing_map_it1;
243   int x_bearing, x_advance;
244   int len = utf8_text.length();
245   int offset = 0;
246   const char *text = utf8_text.c_str();
247   while (offset < len) {
248     offset += render->RenderToImage(text + offset, strlen(text + offset), nullptr);
249     const std::vector<BoxChar *> &boxes = render->GetBoxes();
250 
251     // If the page break split a bigram, correct the offset so we try the bigram
252     // on the next iteration.
253     if (boxes.size() > 2 && !IsWhitespaceBox(boxes[boxes.size() - 1]) &&
254         IsWhitespaceBox(boxes[boxes.size() - 2])) {
255       if (boxes.size() > 3) {
256         tprintf("WARNING: Adjusting to bad page break after '%s%s'\n",
257                 boxes[boxes.size() - 4]->ch().c_str(), boxes[boxes.size() - 3]->ch().c_str());
258       }
259       offset -= boxes[boxes.size() - 1]->ch().size();
260     }
261 
262     for (size_t b = 0; b < boxes.size(); b += 2) {
263       while (b < boxes.size() && IsWhitespaceBox(boxes[b])) {
264         ++b;
265       }
266       if (b + 1 >= boxes.size()) {
267         break;
268       }
269       const std::string &ch0 = boxes[b]->ch();
270       // We encountered a ligature. This happens in at least two scenarios:
271       // One is when the rendered bigram forms a grapheme cluster (eg. the
272       // second character in the bigram is a combining vowel), in which case we
273       // correctly output only one bounding box.
274       // A second far less frequent case is when caused some fonts like 'DejaVu
275       // Sans Ultra-Light' force Pango to render a ligatured character even if
276       // the input consists of the separated characters.  NOTE(ranjith): As per
277       // behdad@ this is not currently controllable at the level of the Pango
278       // API.
279       // The most frequent of all is a single character "word" made by the CJK
280       // segmenter.
281       // Safeguard against these cases here by just skipping the bigram.
282       if (IsWhitespaceBox(boxes[b + 1])) {
283         continue;
284       }
285       int xgap = (boxes[b + 1]->box()->x - (boxes[b]->box()->x + boxes[b]->box()->w));
286       spacing_map_it0 = spacing_map.find(ch0);
287       int ok_count = 0;
288       if (spacing_map_it0 == spacing_map.end() &&
289           render->font().GetSpacingProperties(ch0, &x_bearing, &x_advance)) {
290         spacing_map[ch0] = SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b]->box()->w);
291         spacing_map_it0 = spacing_map.find(ch0);
292         ++ok_count;
293       }
294       const std::string &ch1 = boxes[b + 1]->ch();
295       tlog(3, "%s%s\n", ch0.c_str(), ch1.c_str());
296       spacing_map_it1 = spacing_map.find(ch1);
297       if (spacing_map_it1 == spacing_map.end() &&
298           render->font().GetSpacingProperties(ch1, &x_bearing, &x_advance)) {
299         spacing_map[ch1] =
300             SpacingProperties(x_bearing, x_advance - x_bearing - boxes[b + 1]->box()->w);
301         spacing_map_it1 = spacing_map.find(ch1);
302         ++ok_count;
303       }
304       if (ok_count == 2 &&
305           xgap != (spacing_map_it0->second.x_gap_after + spacing_map_it1->second.x_gap_before)) {
306         spacing_map_it0->second.kerned_x_gaps[ch1] = xgap;
307       }
308     }
309     render->ClearBoxes();
310   }
311   std::string output_string;
312   const int kBufSize = 1024;
313   char buf[kBufSize];
314   snprintf(buf, kBufSize, "%d\n", static_cast<int>(spacing_map.size()));
315   output_string.append(buf);
316   std::map<std::string, SpacingProperties>::const_iterator spacing_map_it;
317   for (spacing_map_it = spacing_map.begin(); spacing_map_it != spacing_map.end();
318        ++spacing_map_it) {
319     snprintf(buf, kBufSize, "%s %d %d %d", spacing_map_it->first.c_str(),
320              spacing_map_it->second.x_gap_before, spacing_map_it->second.x_gap_after,
321              static_cast<int>(spacing_map_it->second.kerned_x_gaps.size()));
322     output_string.append(buf);
323     std::map<std::string, int>::const_iterator kern_it;
324     for (kern_it = spacing_map_it->second.kerned_x_gaps.begin();
325          kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) {
326       snprintf(buf, kBufSize, " %s %d", kern_it->first.c_str(), kern_it->second);
327       output_string.append(buf);
328     }
329     output_string.append("\n");
330   }
331   File::WriteStringToFileOrDie(output_string, output_base + ".fontinfo");
332 }
333 
MakeIndividualGlyphs(Image pix,const std::vector<BoxChar * > & vbox,const int input_tiff_page)334 static bool MakeIndividualGlyphs(Image pix, const std::vector<BoxChar *> &vbox,
335                                  const int input_tiff_page) {
336   // If checks fail, return false without exiting text2image
337   if (!pix) {
338     tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is nullptr\n");
339     return false;
340   } else if (FLAGS_glyph_resized_size <= 0) {
341     tprintf("ERROR: --glyph_resized_size must be positive\n");
342     return false;
343   } else if (FLAGS_glyph_num_border_pixels_to_pad < 0) {
344     tprintf("ERROR: --glyph_num_border_pixels_to_pad must be 0 or positive\n");
345     return false;
346   }
347 
348   const int n_boxes = vbox.size();
349   int n_boxes_saved = 0;
350   int current_tiff_page = 0;
351   int y_previous = 0;
352   static int glyph_count = 0;
353   for (int i = 0; i < n_boxes; i++) {
354     // Get one bounding box
355     Box *b = vbox[i]->mutable_box();
356     if (!b) {
357       continue;
358     }
359     const int x = b->x;
360     const int y = b->y;
361     const int w = b->w;
362     const int h = b->h;
363     // Check present tiff page (for multipage tiff)
364     if (y < y_previous - pixGetHeight(pix) / 10) {
365       tprintf("ERROR: Wrap-around encountered, at i=%d\n", i);
366       current_tiff_page++;
367     }
368     if (current_tiff_page < input_tiff_page) {
369       continue;
370     } else if (current_tiff_page > input_tiff_page) {
371       break;
372     }
373     // Check box validity
374     if (x < 0 || y < 0 || (x + w - 1) >= pixGetWidth(pix) || (y + h - 1) >= pixGetHeight(pix)) {
375       tprintf(
376           "ERROR: MakeIndividualGlyphs(): Index out of range, at i=%d"
377           " (x=%d, y=%d, w=%d, h=%d\n)",
378           i, x, y, w, h);
379       continue;
380     } else if (w < FLAGS_glyph_num_border_pixels_to_pad &&
381                h < FLAGS_glyph_num_border_pixels_to_pad) {
382       tprintf("ERROR: Input image too small to be a character, at i=%d\n", i);
383       continue;
384     }
385     // Crop the boxed character
386     Image pix_glyph = pixClipRectangle(pix, b, nullptr);
387     if (!pix_glyph) {
388       tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i);
389       continue;
390     }
391     // Resize to square
392     Image pix_glyph_sq =
393         pixScaleToSize(pix_glyph, FLAGS_glyph_resized_size, FLAGS_glyph_resized_size);
394     if (!pix_glyph_sq) {
395       tprintf("ERROR: MakeIndividualGlyphs(): Failed to resize, at i=%d\n", i);
396       continue;
397     }
398     // Zero-pad
399     Image pix_glyph_sq_pad = pixAddBorder(pix_glyph_sq, FLAGS_glyph_num_border_pixels_to_pad, 0);
400     if (!pix_glyph_sq_pad) {
401       tprintf("ERROR: MakeIndividualGlyphs(): Failed to zero-pad, at i=%d\n", i);
402       continue;
403     }
404     // Write out
405     Image pix_glyph_sq_pad_8 = pixConvertTo8(pix_glyph_sq_pad, false);
406     char filename[1024];
407     snprintf(filename, 1024, "%s_%d.jpg", FLAGS_outputbase.c_str(), glyph_count++);
408     if (pixWriteJpeg(filename, pix_glyph_sq_pad_8, 100, 0)) {
409       tprintf(
410           "ERROR: MakeIndividualGlyphs(): Failed to write JPEG to %s,"
411           " at i=%d\n",
412           filename, i);
413       continue;
414     }
415 
416     pix_glyph.destroy();
417     pix_glyph_sq.destroy();
418     pix_glyph_sq_pad.destroy();
419     pix_glyph_sq_pad_8.destroy();
420     n_boxes_saved++;
421     y_previous = y;
422   }
423   if (n_boxes_saved == 0) {
424     return false;
425   } else {
426     tprintf("Total number of characters saved = %d\n", n_boxes_saved);
427     return true;
428   }
429 }
430 } // namespace tesseract
431 
432 using tesseract::DegradeImage;
433 using tesseract::ExtractFontProperties;
434 using tesseract::File;
435 using tesseract::FontUtils;
436 using tesseract::SpanUTF8NotWhitespace;
437 using tesseract::SpanUTF8Whitespace;
438 using tesseract::StringRenderer;
439 
Main()440 static int Main() {
441   if (FLAGS_list_available_fonts) {
442     const std::vector<std::string> &all_fonts = FontUtils::ListAvailableFonts();
443     for (unsigned int i = 0; i < all_fonts.size(); ++i) {
444       // Remove trailing comma: pango-font-description-to-string adds a comma
445       // to some fonts.
446       // See https://github.com/tesseract-ocr/tesseract/issues/408
447       std::string font_name(all_fonts[i].c_str());
448       if (font_name.back() == ',') {
449         font_name.pop_back();
450       }
451       printf("%3u: %s\n", i, font_name.c_str());
452       ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()),
453                       "Font %s is unrecognized.\n", all_fonts[i].c_str());
454     }
455     return EXIT_SUCCESS;
456   }
457 
458   // Check validity of input flags.
459   if (FLAGS_text.empty()) {
460     tprintf("'--text' option is missing!\n");
461     exit(1);
462   }
463   if (FLAGS_outputbase.empty()) {
464     tprintf("'--outputbase' option is missing!\n");
465     exit(1);
466   }
467   if (!FLAGS_unicharset_file.empty() && FLAGS_render_ngrams) {
468     tprintf("Use '--unicharset_file' only if '--render_ngrams' is set.\n");
469     exit(1);
470   }
471 
472   std::string font_name = FLAGS_font.c_str();
473   if (!FLAGS_find_fonts && !FontUtils::IsAvailableFont(font_name.c_str())) {
474     font_name += ',';
475     std::string pango_name;
476     if (!FontUtils::IsAvailableFont(font_name.c_str(), &pango_name)) {
477       tprintf("Could not find font named '%s'.\n", FLAGS_font.c_str());
478       if (!pango_name.empty()) {
479         tprintf("Pango suggested font '%s'.\n", pango_name.c_str());
480       }
481       tprintf("Please correct --font arg.\n");
482       exit(1);
483     }
484   }
485 
486   if (FLAGS_render_ngrams) {
487     FLAGS_output_word_boxes = true;
488   }
489 
490   char font_desc_name[1024];
491   snprintf(font_desc_name, 1024, "%s %d", font_name.c_str(), static_cast<int>(FLAGS_ptsize));
492 
493   StringRenderer render(font_desc_name, FLAGS_xsize, FLAGS_ysize);
494   render.set_add_ligatures(FLAGS_ligatures);
495   render.set_leading(FLAGS_leading);
496   render.set_resolution(FLAGS_resolution);
497   render.set_char_spacing(FLAGS_char_spacing * FLAGS_ptsize);
498   render.set_h_margin(FLAGS_margin);
499   render.set_v_margin(FLAGS_margin);
500   render.set_output_word_boxes(FLAGS_output_word_boxes);
501   render.set_box_padding(FLAGS_box_padding);
502   render.set_strip_unrenderable_words(FLAGS_strip_unrenderable_words);
503   render.set_underline_start_prob(FLAGS_underline_start_prob);
504   render.set_underline_continuation_prob(FLAGS_underline_continuation_prob);
505 
506   // Set text rendering orientation and their forms.
507   if (FLAGS_writing_mode == "horizontal") {
508     // Render regular horizontal text (default).
509     render.set_vertical_text(false);
510     render.set_gravity_hint_strong(false);
511     render.set_render_fullwidth_latin(false);
512   } else if (FLAGS_writing_mode == "vertical") {
513     // Render vertical text. Glyph orientation is selected by Pango.
514     render.set_vertical_text(true);
515     render.set_gravity_hint_strong(false);
516     render.set_render_fullwidth_latin(false);
517   } else if (FLAGS_writing_mode == "vertical-upright") {
518     // Render vertical text. Glyph orientation is set to be upright.
519     // Also Basic Latin characters are converted to their fullwidth forms
520     // on rendering, since fullwidth Latin characters are well designed to fit
521     // vertical text lines, while .box files store halfwidth Basic Latin
522     // unichars.
523     render.set_vertical_text(true);
524     render.set_gravity_hint_strong(true);
525     render.set_render_fullwidth_latin(true);
526   } else {
527     tprintf("Invalid writing mode: %s\n", FLAGS_writing_mode.c_str());
528     exit(1);
529   }
530 
531   std::string src_utf8;
532   // This c_str is NOT redundant!
533   if (!File::ReadFileToString(FLAGS_text.c_str(), &src_utf8)) {
534     tprintf("Failed to read file: %s\n", FLAGS_text.c_str());
535     exit(1);
536   }
537 
538   // Remove the unicode mark if present.
539   if (strncmp(src_utf8.c_str(), "\xef\xbb\xbf", 3) == 0) {
540     src_utf8.erase(0, 3);
541   }
542   tlog(1, "Render string of size %zu\n", src_utf8.length());
543 
544   if (FLAGS_render_ngrams || FLAGS_only_extract_font_properties) {
545     // Try to preserve behavior of old text2image by expanding inter-word
546     // spaces by a factor of 4.
547     const std::string kSeparator = FLAGS_render_ngrams ? "    " : " ";
548     // Also restrict the number of characters per line to try and avoid
549     // line-breaking in the middle of words like "-A", "R$" etc. which are
550     // otherwise allowed by the standard unicode line-breaking rules.
551     const unsigned int kCharsPerLine = (FLAGS_ptsize > 20) ? 50 : 100;
552     std::string rand_utf8;
553     UNICHARSET unicharset;
554     if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() &&
555         !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) {
556       tprintf("Failed to load unicharset from file %s\n", FLAGS_unicharset_file.c_str());
557       exit(1);
558     }
559 
560     // If we are rendering ngrams that will be OCRed later, shuffle them so that
561     // tesseract does not have difficulties finding correct baseline, word
562     // spaces, etc.
563     const char *str8 = src_utf8.c_str();
564     int len = src_utf8.length();
565     int step;
566     std::vector<std::pair<int, int>> offsets;
567     int offset = SpanUTF8Whitespace(str8);
568     while (offset < len) {
569       step = SpanUTF8NotWhitespace(str8 + offset);
570       offsets.emplace_back(offset, step);
571       offset += step;
572       offset += SpanUTF8Whitespace(str8 + offset);
573     }
574     if (FLAGS_render_ngrams) {
575       std::seed_seq seed{kRandomSeed};
576       std::mt19937 random_gen(seed);
577       std::shuffle(offsets.begin(), offsets.end(), random_gen);
578     }
579 
580     for (size_t i = 0, line = 1; i < offsets.size(); ++i) {
581       const char *curr_pos = str8 + offsets[i].first;
582       int ngram_len = offsets[i].second;
583       // Skip words that contain characters not in found in unicharset.
584       std::string cleaned = UNICHARSET::CleanupString(curr_pos, ngram_len);
585       if (!FLAGS_unicharset_file.empty() &&
586           !unicharset.encodable_string(cleaned.c_str(), nullptr)) {
587         continue;
588       }
589       rand_utf8.append(curr_pos, ngram_len);
590       if (rand_utf8.length() > line * kCharsPerLine) {
591         rand_utf8.append(" \n");
592         ++line;
593         if (line & 0x1) {
594           rand_utf8.append(kSeparator);
595         }
596       } else {
597         rand_utf8.append(kSeparator);
598       }
599     }
600     tlog(1, "Rendered ngram string of size %zu\n", rand_utf8.length());
601     src_utf8.swap(rand_utf8);
602   }
603   if (FLAGS_only_extract_font_properties) {
604     tprintf("Extracting font properties only\n");
605     ExtractFontProperties(src_utf8, &render, FLAGS_outputbase.c_str());
606     tprintf("Done!\n");
607     return 0;
608   }
609 
610   int im = 0;
611   std::vector<float> page_rotation;
612   const char *to_render_utf8 = src_utf8.c_str();
613 
614   tesseract::TRand randomizer;
615   randomizer.set_seed(kRandomSeed);
616   std::vector<std::string> font_names;
617   // We use a two pass mechanism to rotate images in both direction.
618   // The first pass(0) will rotate the images in random directions and
619   // the second pass(1) will mirror those rotations.
620   int num_pass = FLAGS_bidirectional_rotation ? 2 : 1;
621   for (int pass = 0; pass < num_pass; ++pass) {
622     int page_num = 0;
623     std::string font_used;
624     for (size_t offset = 0;
625          offset < strlen(to_render_utf8) && (FLAGS_max_pages == 0 || page_num < FLAGS_max_pages);
626          ++im, ++page_num) {
627       tlog(1, "Starting page %d\n", im);
628       Image pix = nullptr;
629       if (FLAGS_find_fonts) {
630         offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset,
631                                                strlen(to_render_utf8 + offset), &font_used, &pix);
632       } else {
633         offset +=
634             render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix);
635       }
636       if (pix != nullptr) {
637         float rotation = 0;
638         if (pass == 1) {
639           // Pass 2, do mirror rotation.
640           rotation = -1 * page_rotation[page_num];
641         }
642         if (FLAGS_degrade_image) {
643           pix = DegradeImage(pix, FLAGS_exposure, &randomizer,
644                              FLAGS_rotate_image ? &rotation : nullptr);
645         }
646         if (FLAGS_distort_image) {
647           // TODO: perspective is set to false and box_reduction to 1.
648           pix = PrepareDistortedPix(pix, false, FLAGS_invert, FLAGS_white_noise, FLAGS_smooth_noise,
649                                     FLAGS_blur, 1, &randomizer, nullptr);
650         }
651         render.RotatePageBoxes(rotation);
652 
653         if (pass == 0) {
654           // Pass 1, rotate randomly and store the rotation..
655           page_rotation.push_back(rotation);
656         }
657 
658         Image gray_pix = pixConvertTo8(pix, false);
659         pix.destroy();
660         Image binary = pixThresholdToBinary(gray_pix, 128);
661         gray_pix.destroy();
662         char tiff_name[1024];
663         if (FLAGS_find_fonts) {
664           if (FLAGS_render_per_font) {
665             std::string fontname_for_file = tesseract::StringReplace(font_used, " ", "_");
666             snprintf(tiff_name, 1024, "%s.%s.tif", FLAGS_outputbase.c_str(),
667                      fontname_for_file.c_str());
668             pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, "w");
669             tprintf("Rendered page %d to file %s\n", im, tiff_name);
670           } else {
671             font_names.push_back(font_used);
672           }
673         } else {
674           snprintf(tiff_name, 1024, "%s.tif", FLAGS_outputbase.c_str());
675           pixWriteTiff(tiff_name, binary, IFF_TIFF_G4, im == 0 ? "w" : "a");
676           tprintf("Rendered page %d to file %s\n", im, tiff_name);
677         }
678         // Make individual glyphs
679         if (FLAGS_output_individual_glyph_images) {
680           if (!MakeIndividualGlyphs(binary, render.GetBoxes(), im)) {
681             tprintf("ERROR: Individual glyphs not saved\n");
682           }
683         }
684         binary.destroy();
685       }
686       if (FLAGS_find_fonts && offset != 0) {
687         // We just want a list of names, or some sample images so we don't need
688         // to render more than the first page of the text.
689         break;
690       }
691     }
692   }
693   if (!FLAGS_find_fonts) {
694     std::string box_name = FLAGS_outputbase.c_str();
695     box_name += ".box";
696     render.WriteAllBoxes(box_name);
697   } else if (!FLAGS_render_per_font && !font_names.empty()) {
698     std::string filename = FLAGS_outputbase.c_str();
699     filename += ".fontlist.txt";
700     FILE *fp = fopen(filename.c_str(), "wb");
701     if (fp == nullptr) {
702       tprintf("Failed to create output font list %s\n", filename.c_str());
703     } else {
704       for (auto &font_name : font_names) {
705         fprintf(fp, "%s\n", font_name.c_str());
706       }
707       fclose(fp);
708     }
709   }
710 
711   return 0;
712 }
713 
main(int argc,char ** argv)714 int main(int argc, char **argv) {
715   // Respect environment variable. could be:
716   // fc (fontconfig), win32, and coretext
717   // If not set force fontconfig for Mac OS.
718   // See https://github.com/tesseract-ocr/tesseract/issues/736
719   char *backend;
720   backend = getenv("PANGOCAIRO_BACKEND");
721   if (backend == nullptr) {
722     static char envstring[] = "PANGOCAIRO_BACKEND=fc";
723     putenv(envstring);
724   } else {
725     printf(
726         "Using '%s' as pango cairo backend based on environment "
727         "variable.\n",
728         backend);
729   }
730   tesseract::CheckSharedLibraryVersion();
731   if (argc > 1) {
732     if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
733       FontUtils::PangoFontTypeInfo();
734       printf("Pango version: %s\n", pango_version_string());
735     }
736   }
737   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
738   return Main();
739 }
740