1 /**********************************************************************
2  * File:        normstrngs.h
3  * Description: Utilities to normalize and manipulate UTF-32 and
4  *              UTF-8 strings.
5  * Author:      Ranjith Unnikrishnan
6  * Created:     Thu July 4 2013
7  *
8  * (C) Copyright 2013, Google Inc.
9  * Licensed under the Apache License, Version 2.0 (the "License");
10  * you may not use this file except in compliance with the License.
11  * You may obtain a copy of the License at
12  * http://www.apache.org/licenses/LICENSE-2.0
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  *
19  **********************************************************************/
20 
21 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
22 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
23 
24 #include "export.h"
25 
26 #include "validator.h"
27 
28 #include <string>
29 #include <vector>
30 
31 namespace tesseract {
32 
// Selects which of the standard Unicode normalization forms is applied.
// The enumerators are named after the corresponding forms in the Unicode
// standard.
enum class UnicodeNormMode {
  kNFD,  // Form D: canonical decomposition.
  kNFC,  // Form C: canonical decomposition followed by canonical composition.
  kNFKD, // Form KD: compatibility decomposition.
  kNFKC, // Form KC: compatibility decomposition followed by canonical
         // composition.
};
40 
// Controls whether differences in punctuation that are ambiguous, like
// curly quotes and different widths of dash, are normalized away.
enum class OCRNorm {
  kNone,      // Do not request the OCR-specific punctuation normalization.
  kNormalize, // Request the OCR-specific punctuation normalization.
};
47 
// Controls whether text is validated and some subtle differences that can
// occur in Indic scripts are normalized away, eg ensuring that an explicit
// virama is always followed by a zero-width non-joiner.
enum class GraphemeNorm {
  kNone,      // Do not request grapheme validation/normalization.
  kNormalize, // Request grapheme validation/normalization.
};
55 
56 // Normalizes a UTF8 string according to the given modes. Returns true on
57 // success. If false is returned, some failure or invalidity was present, and
58 // the result string is produced on a "best effort" basis.
59 TESS_UNICHARSET_TRAINING_API
60 bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
61                          GraphemeNorm grapheme_normalize, const char *str8,
62                          std::string *normalized);
63 
64 // Normalizes a UTF8 string according to the given modes and splits into
65 // graphemes according to g_mode. Returns true on success. If false is returned,
66 // some failure or invalidity was present, and the result string is produced on
67 // a "best effort" basis.
68 TESS_UNICHARSET_TRAINING_API
69 bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
70                                   GraphemeNormMode g_mode, bool report_errors, const char *str8,
71                                   std::vector<std::string> *graphemes);
72 
73 // Applies just the OCR-specific normalizations and return the normalized char.
74 char32 OCRNormalize(char32 ch);
75 
76 // Returns true if the OCRNormalized ch1 and ch2 are the same.
77 bool IsOCREquivalent(char32 ch1, char32 ch2);
78 
79 // Returns true if the value lies in the range of valid unicodes.
80 bool IsValidCodepoint(const char32 ch);
81 
82 // Returns true a code point has the White_Space Unicode property.
83 TESS_UNICHARSET_TRAINING_API
84 bool IsWhitespace(const char32 ch);
85 
86 // Returns true if every char in the given (null-terminated) string has the
87 // White_Space Unicode property.
88 TESS_UNICHARSET_TRAINING_API
89 bool IsUTF8Whitespace(const char *text);
90 
91 // Returns the length of bytes of the prefix of 'text' that have the White_Space
92 // unicode property.
93 TESS_UNICHARSET_TRAINING_API
94 unsigned int SpanUTF8Whitespace(const char *text);
95 
96 // Returns the length of bytes of the prefix of 'text' that DO NOT have the
97 // White_Space unicode property.
98 TESS_UNICHARSET_TRAINING_API
99 unsigned int SpanUTF8NotWhitespace(const char *text);
100 
101 // Returns true if the char is interchange valid i.e. no C0 or C1 control codes
102 // (other than CR LF HT FF) and no non-characters.
103 TESS_UNICHARSET_TRAINING_API
104 bool IsInterchangeValid(const char32 ch);
105 
106 // Same as above but restricted to 7-bit ASCII.
107 TESS_UNICHARSET_TRAINING_API
108 bool IsInterchangeValid7BitAscii(const char32 ch);
109 
110 // Convert a full-width UTF-8 string to half-width.
111 TESS_UNICHARSET_TRAINING_API
112 char32 FullwidthToHalfwidth(const char32 ch);
113 
114 } // namespace tesseract
115 
116 #endif // TESSERACT_CCUTIL_NORMSTRNGS_H_
117