1 #include "validate_grapheme.h"
2 #include "tprintf.h"
3 #include "unicode/uchar.h" // From libicu
4 
5 namespace tesseract {
6 
ConsumeGraphemeIfValid()7 bool ValidateGrapheme::ConsumeGraphemeIfValid() {
8   const unsigned num_codes = codes_.size();
9   char32 prev_prev_ch = ' ';
10   char32 prev_ch = ' ';
11   CharClass prev_cc = CharClass::kWhitespace;
12   int num_codes_in_grapheme = 0;
13   while (codes_used_ < num_codes) {
14     CharClass cc = codes_[codes_used_].first;
15     char32 ch = codes_[codes_used_].second;
16     const bool is_combiner = cc == CharClass::kCombiner || cc == CharClass::kVirama;
17 // TODO: Make this code work well with RTL text.
18 // See
19 // https://github.com/tesseract-ocr/tesseract/pull/2266#issuecomment-467114751
20 #if 0
21     // Reject easily detected badly formed sequences.
22     if (prev_cc == CharClass::kWhitespace && is_combiner) {
23       if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
24      return false;
25     }
26 #endif
27     if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
28       if (report_errors_) {
29         tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
30       }
31       return false;
32     }
33     if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
34         IsBadlyFormed(prev_ch, ch)) {
35       return false;
36     }
37     bool prev_is_fwd_combiner = prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
38                                 (prev_ch == kZeroWidthNonJoiner &&
39                                  (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
40     if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) {
41       break;
42     }
43     CodeOnlyToOutput();
44     ++num_codes_in_grapheme;
45     prev_prev_ch = prev_ch;
46     prev_ch = ch;
47     prev_cc = cc;
48   }
49   if (num_codes_in_grapheme > 0) {
50     MultiCodePart(num_codes_in_grapheme);
51   }
52   return true;
53 }
54 
UnicodeToCharClass(char32 ch) const55 Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const {
56   if (IsVedicAccent(ch)) {
57     return CharClass::kVedicMark;
58   }
59   // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they
60   // always combine with the previous character.
61   if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) {
62     return CharClass::kVirama;
63   }
64   if (u_isUWhiteSpace(ch)) {
65     return CharClass::kWhitespace;
66   }
67   // Workaround for Javanese Aksara's Taling, do not label it as a combiner
68   if (ch == 0xa9ba) {
69     return CharClass::kConsonant;
70   }
71   int char_type = u_charType(ch);
72   if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
73       char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
74       ch == kZeroWidthJoiner) {
75     return CharClass::kCombiner;
76   }
77   return CharClass::kOther;
78 }
79 
80 // Helper returns true if the sequence prev_ch,ch is invalid.
IsBadlyFormed(char32 prev_ch,char32 ch)81 bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
82   // Reject badly formed Indic vowels.
83   if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
84     if (report_errors_) {
85       tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
86     }
87     return true;
88   }
89   if (IsBadlyFormedThai(prev_ch, ch)) {
90     if (report_errors_) {
91       tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
92     }
93     return true;
94   }
95   return false;
96 }
97 
98 // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
99 // Some vowels in Indic scripts may be analytically decomposed into atomic pairs
100 // of components that are themselves valid unicode symbols. (See Table 12-1 in
101 // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
102 // for examples in Devanagari). The Unicode standard discourages specifying
103 // vowels this way, but they are sometimes encountered in text, probably because
104 // some editors still permit it. Renderers however dislike such pairs, and so
105 // this function may be used to detect their occurrence for removal.
106 // TODO(rays) This function only covers a subset of Indic languages and doesn't
107 // include all rules. Add rules as appropriate to support other languages or
108 // find a way to generalize these existing rules that makes use of the
109 // regularity of the mapping from ISCII to Unicode.
110 /* static */
IsBadlyFormedIndicVowel(char32 prev_ch,char32 ch)111 bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
112   return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || (prev_ch == 0x909 && ch == 0x941) ||
113           (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
114           (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
115           (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
116           // Illegal combinations of two dependent Devanagari vowels.
117           (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
118           // Dependent Devanagari vowels following a virama.
119           (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
120           // Bengali vowels (Table 9-5, pg 313)
121           (prev_ch == 0x985 && ch == 0x9BE) ||
122           // Telugu vowels (Table 9-19, pg 331)
123           (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
124           // Kannada vowels (Table 9-20, pg 332)
125           (prev_ch == 0xC92 && ch == 0xCCC));
126 }
127 
128 // Helper returns true if ch is a Thai consonant.
IsThaiConsonant(char32 ch)129 static bool IsThaiConsonant(char32 ch) {
130   return 0xe01 <= ch && ch <= 0xe2e;
131 }
132 
133 // Helper returns true is ch is a before-consonant vowel.
IsThaiBeforeConsonantVowel(char32 ch)134 static bool IsThaiBeforeConsonantVowel(char32 ch) {
135   return 0xe40 <= ch && ch <= 0xe44;
136 }
137 
138 // Helper returns true if ch is a Thai tone mark.
IsThaiToneMark(char32 ch)139 static bool IsThaiToneMark(char32 ch) {
140   return 0xe48 <= ch && ch <= 0xe4b;
141 }
142 
143 // Helper returns true if ch is a Thai vowel that may be followed by a tone
144 // mark.
IsThaiTonableVowel(char32 ch)145 static bool IsThaiTonableVowel(char32 ch) {
146   return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
147 }
148 
149 // Helper returns true if the sequence prev_ch,ch is invalid Thai.
150 // These rules come from a native Thai speaker, and are not covered by the
151 // Thai section in the unicode book:
152 // http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
153 // Comments below added by Ray interpreting the code ranges.
154 /* static */
IsBadlyFormedThai(char32 prev_ch,char32 ch)155 bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
156   // Tone marks must follow consonants or specific vowels.
157   if (IsThaiToneMark(ch) && !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
158     return true;
159   }
160   // Tonable vowels must follow consonants.
161   if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
162     return true;
163   }
164   // Thanthakhat must follow consonant or specific vowels.
165   if (ch == 0xe4c && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
166     return true;
167   }
168   // Nikkhahit must follow a consonant ?or certain markers?.
169   // TODO(rays) confirm this, but there were so many in the ground truth of the
170   // validation set that it seems reasonable to assume it is valid.
171   if (ch == 0xe4d && !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
172     return true;
173   }
174   // The vowels e30, e32, e33 can be used more liberally.
175   if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
176       !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
177       !(prev_ch == 0xe32 && ch == 0xe30) && !(prev_ch == 0xe4d && ch == 0xe32)) {
178     return true;
179   }
180   // Some vowels come before consonants, and therefore cannot follow things
181   // that cannot end a syllable.
182   if (IsThaiBeforeConsonantVowel(ch) &&
183       (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || prev_ch == 0xe37)) {
184     return true;
185   }
186   // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
187   if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
188     return true;
189   }
190   return false;
191 }
192 
193 } // namespace tesseract
194