1 // Copyright 2010-2018, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 //     * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 //     * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 //     * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 #ifndef MOZC_BASE_NUMBER_UTIL_H_
31 #define MOZC_BASE_NUMBER_UTIL_H_
32 
33 #include <string>
34 #include <vector>
35 
36 #include "base/port.h"
37 #include "base/string_piece.h"
38 
39 namespace mozc {
40 
41 // This class sets up utilities to manage strings including numbers like
42 // Arabic numbers, Roman numbers, Kanji numbers, and so on.
43 class NumberUtil {
44  public:
45   // Converts the string to a number and return it.
46   static int SimpleAtoi(StringPiece str);
47 
48   // Returns true if the given input_string contains only number characters
49   // (regardless of halfwidth or fullwidth).
50   // False for empty string.
51   static bool IsArabicNumber(StringPiece input_string);
52 
53   // Returns true if the given str consists of only ASCII digits.
54   // False for empty string.
55   static bool IsDecimalInteger(StringPiece str);
56 
57   struct NumberString {
58    public:
59     enum Style {
60         DEFAULT_STYLE = 0,
61         // 123,456,789
62         NUMBER_SEPARATED_ARABIC_HALFWIDTH,
63         // "123,456,789"
64         NUMBER_SEPARATED_ARABIC_FULLWIDTH,
65         // "123億456万7890"
66         NUMBER_ARABIC_AND_KANJI_HALFWIDTH,
67         // "123億456万7890"
68         NUMBER_ARABIC_AND_KANJI_FULLWIDTH,
69         // "一億二千三百四十五万六千七百八十九"
70         NUMBER_KANJI,
71         // "壱億弐千参百四拾五万六千七百八拾九"
72         NUMBER_OLD_KANJI,
73         // "ⅠⅡⅢ"
74         NUMBER_ROMAN_CAPITAL,
75         // "ⅰⅱⅲ"
76         NUMBER_ROMAN_SMALL,
77         // "①②③"
78         NUMBER_CIRCLED,
79         // "ニ〇〇"
80         NUMBER_KANJI_ARABIC,
81         // "0x4d2" (1234 in decimal)
82         NUMBER_HEX,
83         // "02322" (1234 in decimal)
84         NUMBER_OCT,
85         // "0b10011010010" (1234 in decimal)
86         NUMBER_BIN,
87     };
88 
NumberStringNumberString89     NumberString(StringPiece value, StringPiece description, Style style)
90         : value(value.as_string()),
91           description(description.as_string()),
92           style(style) {}
93 
94     // Converted string
95     string value;
96 
97     // Description of Converted String
98     string description;
99 
100     // Converted Number Style
101     Style style;
102   };
103 
104   // Following five functions are main functions to convert number strings.
105   // They receive two arguments:
106   //   - input_num: a string consisting of Arabic numeric characters.
107   //   - output: a vector consists of conveted results.
108   // If |input_num| is invalid or cannot represent as the form, these
109   // functions do nothing.  If a method finds more than one representations,
110   // it pushes all candidates into the output.
111 
112   // Converts half-width Arabic number string to Kan-su-ji string.
113   //   - input_num: a string which *must* be half-width number string.
114   //   - output: function appends new representation into output vector.
115   // value, desc and style are stored same size and same order.
116   // if invalid string is set, this function do nothing.
117   static bool ArabicToKanji(StringPiece input_num,
118                             std::vector<NumberString> *output);
119 
120   // Converts half-width Arabic number string to Separated Arabic string.
121   // (e.g. 1234567890 are converted to 1,234,567,890)
122   // Arguments are same as ArabicToKanji (above).
123   static bool ArabicToSeparatedArabic(StringPiece input_num,
124                                       std::vector<NumberString> *output);
125 
126   // Converts half-width Arabic number string to full-width Arabic number
127   // string.
128   // Arguments are same as ArabicToKanji (above).
129   static bool ArabicToWideArabic(StringPiece input_num,
130                                  std::vector<NumberString> *output);
131 
132   // Converts half-width Arabic number to various styles.
133   // Arguments are same as ArabicToKanji (above).
134   //   - Roman style (i) (ii) ...
135   static bool ArabicToOtherForms(StringPiece input_num,
136                                  std::vector<NumberString> *output);
137 
138   // Converts half-width Arabic number to various radices (2,8,16).
139   // Arguments are same as ArabicToKanji (above).
140   // Excepted number of input digits is smaller than 20, but it can be
141   // converted only if it can be stored in an unsigned 64-bit integer.
142   static bool ArabicToOtherRadixes(StringPiece input_num,
143                                    std::vector<NumberString> *output);
144 
145   // Converts the string to a 32-/64-bit signed/unsigned int.  Returns true if
146   // success or false if the string is in the wrong format.
147   static bool SafeStrToInt16(StringPiece str, int16 *value);
148   static bool SafeStrToInt32(StringPiece str, int32 *value);
149   static bool SafeStrToInt64(StringPiece str, int64 *value);
150   static bool SafeStrToUInt16(StringPiece str, uint16 *value);
151   static bool SafeStrToUInt32(StringPiece str, uint32 *value);
152   static bool SafeStrToUInt64(StringPiece str, uint64 *value);
153   static bool SafeHexStrToUInt32(StringPiece str, uint32 *value);
154   static bool SafeOctStrToUInt32(StringPiece str, uint32 *value);
155 
156   // Converts the string to a double.  Returns true if success or false if the
157   // string is in the wrong format.
158   // If |str| is a hexadecimal number like "0x1234", the result depends on
159   // compiler.  It returns false when compiled by VisualC++.  On the other hand
160   // it returns true and sets correct value when compiled by gcc.
161   static bool SafeStrToDouble(StringPiece str, double *value);
162 
163   // Convert Kanji numeric into Arabic numeric.
164   // When the trim_leading_zeros is true, leading zeros for arabic_output
165   // are trimmed off.
166   // TODO(toshiyuki): This parameter is only applied for arabic_output now.
167   //
168   // Input: "2千五百"
169   // kanji_output: "二千五百"
170   // arabic output: 2500
171   //
172   // NormalizeNumbers() returns false if it finds non-number characters.
173   // NormalizeNumbersWithSuffix() skips trailing non-number characters and
174   // return them in "suffix".
175   static bool NormalizeNumbers(StringPiece input,
176                                bool trim_leading_zeros,
177                                string *kanji_output,
178                                string *arabic_output);
179 
180   static bool NormalizeNumbersWithSuffix(StringPiece input,
181                                          bool trim_leading_zeros,
182                                          string *kanji_output,
183                                          string *arabic_output,
184                                          string *suffix);
185 
186   // Note: this function just does charcter-by-character conversion
187   // "百二十" -> 10020
188   static void KanjiNumberToArabicNumber(StringPiece input, string *output);
189 
190  private:
191   DISALLOW_IMPLICIT_CONSTRUCTORS(NumberUtil);
192 };
193 
194 }  // namespace mozc
195 
196 #endif  // MOZC_BASE_NUMBER_UTIL_H_
197