1 #include "validate_grapheme.h"
2 #include "tprintf.h"
3 #include "unicode/uchar.h" // From libicu
4
5 namespace tesseract {
6
ConsumeGraphemeIfValid()7 bool ValidateGrapheme::ConsumeGraphemeIfValid() {
8 const unsigned num_codes = codes_.size();
9 char32 prev_prev_ch = ' ';
10 char32 prev_ch = ' ';
11 CharClass prev_cc = CharClass::kWhitespace;
12 int num_codes_in_grapheme = 0;
13 while (codes_used_ < num_codes) {
14 CharClass cc = codes_[codes_used_].first;
15 char32 ch = codes_[codes_used_].second;
16 const bool is_combiner = cc == CharClass::kCombiner || cc == CharClass::kVirama;
17 // TODO: Make this code work well with RTL text.
18 // See
19 // https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20 #if 0
21 // Reject easily detected badly formed sequences.
22 if (prev_cc == CharClass::kWhitespace && is_combiner) {
23 if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24 return false;
25 }
26 #endif
27 if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
28 if (report_errors_) {
29 tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30 }
31 return false;
32 }
33 if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
34 IsBadlyFormed(prev_ch, ch)) {
35 return false;
36 }
37 bool prev_is_fwd_combiner = prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38 (prev_ch == kZeroWidthNonJoiner &&
39 (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40 if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) {
41 break;
42 }
43 CodeOnlyToOutput();
44 ++num_codes_in_grapheme;
45 prev_prev_ch = prev_ch;
46 prev_ch = ch;
47 prev_cc = cc;
48 }
49 if (num_codes_in_grapheme > 0) {
50 MultiCodePart(num_codes_in_grapheme);
51 }
52 return true;
53 }
54
UnicodeToCharClass(char32 ch) const55 Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const {
56 if (IsVedicAccent(ch)) {
57 return CharClass::kVedicMark;
58 }
59 // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
60 // always combine with the previous character.
61 if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) {
62 return CharClass::kVirama;
63 }
64 if (u_isUWhiteSpace(ch)) {
65 return CharClass::kWhitespace;
66 }
67 // Workaround for Javanese Aksara's Taling, do not label it as a combiner
68 if (ch == 0xa9ba) {
69 return CharClass::kConsonant;
70 }
71 int char_type = u_charType(ch);
72 if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
73 char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
74 ch == kZeroWidthJoiner) {
75 return CharClass::kCombiner;
76 }
77 return CharClass::kOther;
78 }
79
80 // Helper returns true if the sequence prev_ch,ch is invalid.
IsBadlyFormed(char32 prev_ch,char32 ch)81 bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
82 // Reject badly formed Indic vowels.
83 if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
84 if (report_errors_) {
85 tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
86 }
87 return true;
88 }
89 if (IsBadlyFormedThai(prev_ch, ch)) {
90 if (report_errors_) {
91 tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
92 }
93 return true;
94 }
95 return false;
96 }
97
98 // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
99 // Some vowels in Indic scripts may be analytically decomposed into atomic pairs
100 // of components that are themselves valid unicode symbols. (See Table 12-1 in
101 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
102 // for examples in Devanagari). The Unicode standard discourages specifying
103 // vowels this way, but they are sometimes encountered in text, probably because
104 // some editors still permit it. Renderers however dislike such pairs, and so
105 // this function may be used to detect their occurrence for removal.
106 // TODO(rays) This function only covers a subset of Indic languages and doesn't
107 // include all rules. Add rules as appropriate to support other languages or
108 // find a way to generalize these existing rules that makes use of the
109 // regularity of the mapping from ISCII to Unicode.
110 /* static */
IsBadlyFormedIndicVowel(char32 prev_ch,char32 ch)111 bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
112 return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || (prev_ch == 0x909 && ch == 0x941) ||
113 (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
114 (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
115 (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
116 // Illegal combinations of two dependent Devanagari vowels.
117 (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
118 // Dependent Devanagari vowels following a virama.
119 (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
120 // Bengali vowels (Table 9-5, pg 313)
121 (prev_ch == 0x985 && ch == 0x9BE) ||
122 // Telugu vowels (Table 9-19, pg 331)
123 (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
124 // Kannada vowels (Table 9-20, pg 332)
125 (prev_ch == 0xC92 && ch == 0xCCC));
126 }
127
128 // Helper returns true if ch is a Thai consonant.
IsThaiConsonant(char32 ch)129 static bool IsThaiConsonant(char32 ch) {
130 return 0xe01 <= ch && ch <= 0xe2e;
131 }
132
133 // Helper returns true is ch is a before-consonant vowel.
IsThaiBeforeConsonantVowel(char32 ch)134 static bool IsThaiBeforeConsonantVowel(char32 ch) {
135 return 0xe40 <= ch && ch <= 0xe44;
136 }
137
138 // Helper returns true if ch is a Thai tone mark.
IsThaiToneMark(char32 ch)139 static bool IsThaiToneMark(char32 ch) {
140 return 0xe48 <= ch && ch <= 0xe4b;
141 }
142
143 // Helper returns true if ch is a Thai vowel that may be followed by a tone
144 // mark.
IsThaiTonableVowel(char32 ch)145 static bool IsThaiTonableVowel(char32 ch) {
146 return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
147 }
148
149 // Helper returns true if the sequence prev_ch,ch is invalid Thai.
150 // These rules come from a native Thai speaker, and are not covered by the
151 // Thai section in the unicode book:
152 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
153 // Comments below added by Ray interpreting the code ranges.
154 /* static */
IsBadlyFormedThai(char32 prev_ch,char32 ch)155 bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
156 // Tone marks must follow consonants or specific vowels.
157 if (IsThaiToneMark(ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
158 return true;
159 }
160 // Tonable vowels must follow consonants.
161 if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
162 return true;
163 }
164 // Thanthakhat must follow consonant or specific vowels.
165 if (ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
166 return true;
167 }
168 // Nikkhahit must follow a consonant ?or certain markers?.
169 // TODO(rays) confirm this, but there were so many in the ground truth of the
170 // validation set that it seems reasonable to assume it is valid.
171 if (ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
172 return true;
173 }
174 // The vowels e30, e32, e33 can be used more liberally.
175 if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
176 !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
177 !(prev_ch == 0xe32 && ch == 0xe30) && !(prev_ch == 0xe4d && ch == 0xe32)) {
178 return true;
179 }
180 // Some vowels come before consonants, and therefore cannot follow things
181 // that cannot end a syllable.
182 if (IsThaiBeforeConsonantVowel(ch) &&
183 (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) {
184 return true;
185 }
186 // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
187 if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
188 return true;
189 }
190 return false;
191 }
192
193 } // namespace tesseract
194