1 ///////////////////////////////////////////////////////////////////////
2 // File:        fontinfo.h
3 // Description: Font information classes abstracted from intproto.h/cpp.
4 // Author:      rays@google.com (Ray Smith)
5 //
6 // (C) Copyright 2011, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
17 ///////////////////////////////////////////////////////////////////////
18 
19 #ifndef TESSERACT_CCSTRUCT_FONTINFO_H_
20 #define TESSERACT_CCSTRUCT_FONTINFO_H_
21 
#include "errcode.h"

#include <tesseract/unichar.h>
#include "genericvector.h"

#include <cstdint> // for uint16_t, uint32_t
#include <cstdio>  // for FILE
#include <cstring> // for strcmp
#include <vector>
30 
31 namespace tesseract {
32 
// Forward declaration only: this header merely names a pointer to
// UnicityTable<FontInfo> (see FontInfoTable::MoveTo), so the full definition
// is not needed here.
template <typename T>
class UnicityTable;
35 
36 // Simple struct to hold a font and a score. The scores come from the low-level
37 // integer matcher, so they are in the uint16_t range. Fonts are an index to
38 // fontinfo_table.
39 // These get copied around a lot, so best to keep them small.
struct ScoredFont {
  // A default-constructed entry refers to no font (id -1) and has a zero score.
  ScoredFont() = default;
  // Pairs the given font id with the score the classifier produced for it.
  ScoredFont(int font_id, uint16_t classifier_score)
      : fontinfo_id(font_id), score(classifier_score) {}

  // Index into fontinfo table, but inside the classifier, may be a shapetable
  // index.
  int32_t fontinfo_id = -1;
  // Raw score from the low-level classifier.
  uint16_t score = 0;
};
51 
52 // Struct for information about spacing between characters in a particular font.
53 struct FontSpacingInfo {
54   int16_t x_gap_before;
55   int16_t x_gap_after;
56   std::vector<UNICHAR_ID> kerned_unichar_ids;
57   std::vector<int16_t> kerned_x_gaps;
58 };
59 
/*
 * FontInfo::properties is a bit mask describing the font: italic (bit 0),
 * bold (bit 1), fixed pitch (bit 2), serif (bit 3) and fraktur (bit 4).
 */
64 struct FontInfo {
FontInfoFontInfo65   FontInfo() : name(nullptr), properties(0), universal_id(0), spacing_vec(nullptr) {}
66   ~FontInfo() = default;
67 
68   bool operator==(const FontInfo &rhs) const {
69     return strcmp(name, rhs.name) == 0;
70   }
71 
72   // Writes to the given file. Returns false in case of error.
73   bool Serialize(FILE *fp) const;
74   // Reads from the given file. Returns false in case of error.
75   // If swap is true, assumes a big/little-endian swap is needed.
76   bool DeSerialize(TFile *fp);
77 
78   // Reserves unicharset_size spots in spacing_vec.
init_spacingFontInfo79   void init_spacing(int unicharset_size) {
80     spacing_vec = new std::vector<FontSpacingInfo *>(unicharset_size);
81   }
82   // Adds the given pointer to FontSpacingInfo to spacing_vec member
83   // (FontInfo class takes ownership of the pointer).
84   // Note: init_spacing should be called before calling this function.
add_spacingFontInfo85   void add_spacing(UNICHAR_ID uch_id, FontSpacingInfo *spacing_info) {
86     ASSERT_HOST(static_cast<size_t>(uch_id) < spacing_vec->size());
87     (*spacing_vec)[uch_id] = spacing_info;
88   }
89 
90   // Returns the pointer to FontSpacingInfo for the given UNICHAR_ID.
get_spacingFontInfo91   const FontSpacingInfo *get_spacing(UNICHAR_ID uch_id) const {
92     return (spacing_vec == nullptr || spacing_vec->size() <= static_cast<size_t>(uch_id)) ? nullptr
93                                                                      : (*spacing_vec)[uch_id];
94   }
95 
96   // Fills spacing with the value of the x gap expected between the two given
97   // UNICHAR_IDs. Returns true on success.
get_spacingFontInfo98   bool get_spacing(UNICHAR_ID prev_uch_id, UNICHAR_ID uch_id, int *spacing) const {
99     const FontSpacingInfo *prev_fsi = this->get_spacing(prev_uch_id);
100     const FontSpacingInfo *fsi = this->get_spacing(uch_id);
101     if (prev_fsi == nullptr || fsi == nullptr) {
102       return false;
103     }
104     size_t i = 0;
105     for (; i < prev_fsi->kerned_unichar_ids.size(); ++i) {
106       if (prev_fsi->kerned_unichar_ids[i] == uch_id) {
107         break;
108       }
109     }
110     if (i < prev_fsi->kerned_unichar_ids.size()) {
111       *spacing = prev_fsi->kerned_x_gaps[i];
112     } else {
113       *spacing = prev_fsi->x_gap_after + fsi->x_gap_before;
114     }
115     return true;
116   }
117 
is_italicFontInfo118   bool is_italic() const {
119     return properties & 1;
120   }
is_boldFontInfo121   bool is_bold() const {
122     return (properties & 2) != 0;
123   }
is_fixed_pitchFontInfo124   bool is_fixed_pitch() const {
125     return (properties & 4) != 0;
126   }
is_serifFontInfo127   bool is_serif() const {
128     return (properties & 8) != 0;
129   }
is_frakturFontInfo130   bool is_fraktur() const {
131     return (properties & 16) != 0;
132   }
133 
134   char *name;
135   uint32_t properties;
136   // The universal_id is a field reserved for the initialization process
137   // to assign a unique id number to all fonts loaded for the current
138   // combination of languages. This id will then be returned by
139   // ResultIterator::WordFontAttributes.
140   int32_t universal_id;
141   // Horizontal spacing between characters (indexed by UNICHAR_ID).
142   std::vector<FontSpacingInfo *> *spacing_vec;
143 };
144 
// Every class (character) owns a FontSet that represents all the fonts that can
// render this character.
// Since almost all the characters from the same script share the same set of
// fonts, the sets are shared over multiple classes (see
// Classify::fontset_table_). Thus, a class only stores an id to a set.
// Because some fonts cannot render just one character of a set, there are
// many FontSets that differ by only one font. Rather than storing the
// FontInfo directly in the FontSet structure, it's better to share FontInfos
// among FontSets (Classify::fontinfo_table_).
using FontSet = std::vector<int>;
155 
// Class that adds a bit of functionality on top of GenericVector to
// implement a table of FontInfo that replaces UnicityTable<FontInfo>.
// TODO(rays) change all references once all existing traineddata files
// are replaced.
160 class FontInfoTable : public GenericVector<FontInfo> {
161 public:
162   TESS_API // when you remove inheritance from GenericVector, move this on
163   // class level
164   FontInfoTable();
165   TESS_API
166   ~FontInfoTable();
167 
168   // Writes to the given file. Returns false in case of error.
169   TESS_API
170   bool Serialize(FILE *fp) const;
171   // Reads from the given file. Returns false in case of error.
172   // If swap is true, assumes a big/little-endian swap is needed.
173   TESS_API
174   bool DeSerialize(TFile *fp);
175 
176   // Returns true if the given set of fonts includes one with the same
177   // properties as font_id.
178   TESS_API
179   bool SetContainsFontProperties(int font_id, const std::vector<ScoredFont> &font_set) const;
180   // Returns true if the given set of fonts includes multiple properties.
181   TESS_API
182   bool SetContainsMultipleFontProperties(const std::vector<ScoredFont> &font_set) const;
183 
184   // Moves any non-empty FontSpacingInfo entries from other to this.
185   TESS_API
186   void MoveSpacingInfoFrom(FontInfoTable *other);
187   // Moves this to the target unicity table.
188   TESS_API
189   void MoveTo(UnicityTable<FontInfo> *target);
190 };
191 
192 // Deletion callbacks for GenericVector.
193 void FontInfoDeleteCallback(FontInfo f);
194 
195 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
196 bool read_info(TFile *f, FontInfo *fi);
197 bool write_info(FILE *f, const FontInfo &fi);
198 bool read_spacing_info(TFile *f, FontInfo *fi);
199 bool write_spacing_info(FILE *f, const FontInfo &fi);
200 bool write_set(FILE *f, const FontSet &fs);
201 
202 } // namespace tesseract.
203 
#endif /* TESSERACT_CCSTRUCT_FONTINFO_H_ */
205