1 // MeCab -- Yet Another Part-of-Speech and Morphological Analyzer
2 //
3 //
4 // Copyright(C) 2001-2006 Taku Kudo <taku@chasen.org>
5 // Copyright(C) 2004-2006 Nippon Telegraph and Telephone Corporation
6 #ifndef MECAB_UTILS_H
7 #define MECAB_UTILS_H
8
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>
#include <vector>
#include "common.h"
16
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
// Fixed-width integer types (uint8_t, uint32_t, uint64_t).
//
// Only MSVC up to version 1500 is given hand-written typedefs; every other
// toolchain gets the types from <stdint.h>. The former non-Windows fallback
// used `unsigned __int64` (an MSVC-only keyword) and `unsigned long` for
// uint32_t (64 bits wide on LP64 targets), so it could not produce correct
// types there.
#if !defined(HAVE_STDINT_H) && defined(_WIN32) && !defined(__CYGWIN__) && \
    defined(_MSC_VER) && (_MSC_VER <= 1500)
typedef unsigned char uint8_t;
typedef unsigned long uint32_t;       // `long` is 32 bits wide on Windows
typedef unsigned long long uint64_t;
#else
#include <stdint.h>
#endif
38
39 namespace MeCab {
40
// Forward declaration only; the Param class is defined outside this header.
class Param;

// Integer identifiers for character encodings, numbered from EUC_JP == 0
// through ASCII == 6.
enum { EUC_JP, CP932, UTF8, UTF16, UTF16LE, UTF16BE, ASCII };
// Takes a charset name and returns an int. Presumably the result is one of
// the enumerators above -- the definition is not in this header, confirm there.
int decode_charset(const char *charset);
45
// Writes `val` into `s` in fixed notation with six fractional digits, as a
// NUL-terminated string (e.g. 1.5 -> "1.500000").
//
// `s` must be large enough for the formatted text plus the terminator; no
// bounds checking is performed.
//
// The value is formatted with plain "%f". The previous "%-16f" followed by a
// scan for the first padding space produced the same text for short values,
// but ran past the terminator whenever the text was 16 characters or longer
// (no padding is emitted in that case).
inline void dtoa(double val, char *s) {
  std::sprintf(s, "%f", val);
}
53
// Writes the decimal representation of the integer `val` into `s` as a
// NUL-terminated string, with a leading '-' for negative values.
//
// `s` must be large enough for the digits, the optional sign and the
// terminator; no bounds checking is performed.
template <class T>
inline void itoa(T val, char *s) {
  if (val < 0) *s++ = '-';
  char *t = s;

  // Digits are taken from the (possibly negative) value itself rather than
  // from -val: negating the most negative value overflows. Integer division
  // truncates toward zero, so the remainder of a negative value is <= 0 and
  // only its magnitude is needed.
  do {
    int digit = static_cast<int>(val % 10);
    if (digit < 0) digit = -digit;
    *t++ = static_cast<char>('0' + digit);
    val /= 10;
  } while (val != 0);

  *t = '\0';
  // Digits were produced least-significant first.
  std::reverse(s, t);
}
77
// Writes the decimal representation of `val` into `s` as a NUL-terminated
// string. No sign handling is done. `s` must be large enough for the digits
// plus the terminator; no bounds checking is performed.
template <class T>
inline void uitoa(T val, char *s) {
  char *cursor = s;

  // Emit digits least-significant first.
  for (; val; val /= 10) {
    const T digit = val % 10;
    *cursor++ = static_cast<char>(digit) + '0';
  }

  // Zero yields no digits in the loop above.
  if (cursor == s) *cursor++ = '0';
  *cursor = '\0';

  std::reverse(s, cursor);
}
94
// Advances the cursor `*ptr` by `size` bytes and returns the position it had
// before the advance. No bounds checking is performed.
inline const char *read_ptr(const char **ptr, size_t size) {
  const char *const before = *ptr;
  *ptr = before + size;
  return before;
}
100
101 template <class T>
read_static(const char ** ptr,T & value)102 inline void read_static(const char **ptr, T& value) {
103 const char *r = read_ptr(ptr, sizeof(T));
104 memcpy(&value, r, sizeof(T));
105 }
106
// Declared here, defined elsewhere. Presumably reports whether `filename`
// names an existing file -- confirm against the definition.
bool file_exists(const char *filename);
108
109 int load_request_type(const Param ¶m);
110
// The functions below are only declared in this header; their definitions
// live elsewhere. The descriptions are inferred from the signatures and
// should be confirmed against the definitions.

// Takes a mutable Param; the bool result presumably signals success.
bool load_dictionary_resource(Param *);

// Operates in place on the string `*w`; presumably applies CSV escaping.
bool escape_csv_element(std::string *w);

// Receives a path and an output vector of strings; presumably fills `*dics`
// with the CSV dictionary files found under `path`.
void enum_csv_dictionaries(const char *path,
                           std::vector<std::string> *dics);

// Takes a label plus a current/total pair; presumably renders progress output.
int progress_bar(const char* message, size_t current, size_t total);

// Operates in place on the given string; presumably lower-cases it.
bool toLower(std::string *);

// Returns a new string built from a directory path and a file name;
// presumably the two joined with a path separator.
std::string create_filename(const std::string &path,
                            const std::string &file);
// Both operate in place on `*s`; presumably they strip the file-name part
// and the directory part of a path, respectively.
void remove_filename(std::string *s);
void remove_pathname(std::string *s);
// Operates in place on `*s`; presumably replaces `src` with `dst`.
void replace_string(std::string *s,
                    const std::string &src,
                    const std::string &dst);
129
// Splits the comma-separated line `str` in place and stores a pointer to the
// start of each field through `out`. Returns the number of fields stored,
// which never exceeds `max`.
//
//  - Spaces and tabs at the start of a field are skipped.
//  - A field starting with '"' is read up to the closing quote; a doubled
//    quote ("") inside it stands for one literal quote. Text between the
//    closing quote and the next comma is dropped.
//  - Fields are terminated by overwriting bytes of `str` with '\0'. The
//    field that uses up the last of the `max` slots is NOT terminated, so it
//    runs on to the end of the original string.
//  - A comma at the very end of the line does not produce a trailing empty
//    field, and an empty line yields no fields.
template <class Iterator>
inline size_t tokenizeCSV(char *str,
                          Iterator out, size_t max) {
  // With no room in `out` nothing may be stored. Without this guard the
  // `max--` below would wrap around and the limit would be lost.
  if (max == 0) return 0;

  char *eos = str + std::strlen(str);
  char *start = 0;
  char *end = 0;
  size_t n = 0;

  for (; str < eos; ++str) {
    // skip white spaces
    while (*str == ' ' || *str == '\t') ++str;
    if (*str == '"') {
      start = ++str;
      end = start;
      // Compact the quoted text in place: `end` trails `str` by one byte for
      // every doubled quote that has been collapsed.
      for (; str < eos; ++str) {
        if (*str == '"') {
          str++;
          if (*str != '"')
            break;  // lone quote: end of the quoted field
        }
        *end++ = *str;
      }
      str = std::find(str, eos, ',');
    } else {
      start = str;
      str = std::find(str, eos, ',');
      end = str;
    }
    // Terminate every field except the one filling the last slot.
    if (max-- > 1) *end = '\0';
    *out++ = start;
    ++n;
    if (max == 0) break;
  }

  return n;
}
166
// Splits `str` in place at every character that occurs in `del` and stores a
// pointer to the start of each piece through `out`. Each delimiter found is
// overwritten with '\0'. Adjacent delimiters produce empty pieces. At most
// `max` pieces are stored; the number stored is returned.
template <class Iterator>
inline size_t tokenize(char *str, const char *del,
                       Iterator out, size_t max) {
  char *const text_end = str + std::strlen(str);
  const char *const del_end = del + std::strlen(del);
  size_t count = 0;

  for (char *piece = str; count < max; ) {
    char *const stop = std::find_first_of(piece, text_end, del, del_end);
    const bool reached_end = (stop == text_end);
    *stop = '\0';
    *out++ = piece;
    ++count;
    if (reached_end) break;
    piece = stop + 1;
  }

  return count;
}
185
// Like tokenize(), but a continuous run of delimiters is regarded as a single
// delimiter: empty pieces are never stored. Splits `str` in place (each
// delimiter found is overwritten with '\0'), stores at most `max` non-empty
// pieces through `out`, and returns how many were stored.
template <class Iterator>
inline size_t tokenize2(char *str, const char *del,
                        Iterator out, size_t max) {
  char *const text_end = str + std::strlen(str);
  const char *const del_end = del + std::strlen(del);
  size_t count = 0;
  char *piece = str;

  while (count < max) {
    char *const stop = std::find_first_of(piece, text_end, del, del_end);
    *stop = '\0';
    // The piece is empty exactly when the delimiter (or the end of the text)
    // sits at its first character.
    if (stop != piece) {
      *out++ = piece;
      ++count;
    }
    if (stop == text_end) break;
    piece = stop + 1;
  }

  return count;
}
207
// Returns log(exp(x) + exp(y)), computed relative to the larger operand.
//
// When `flg` is true ("init mode") the function simply returns `y`.
// When the operands differ by more than MINUS_LOG_EPSILON the larger one is
// returned unchanged.
inline double logsumexp(double x, double y, bool flg) {
// Note: being a macro, this stays defined after the function.
#define MINUS_LOG_EPSILON 50

  if (flg) return y;  // init mode
  const double vmin = std::min<double>(x, y);
  const double vmax = std::max<double>(x, y);

  // Equal operands: log(e^v + e^v) = v + log(2). Handling this first also
  // covers two infinities of the same sign, for which vmin - vmax below
  // would be NaN.
  if (vmin == vmax) return vmax + std::log(2.0);

  if (vmax > vmin + MINUS_LOG_EPSILON) {
    return vmax;
  }
  return vmax + std::log(std::exp(vmin - vmax) + 1.0);
}
220
// Computes -n * d, clamps the result to the range [-32767, 32767] and
// returns it converted to short (the fractional part is discarded by the
// conversion).
inline short int tocost(double d, int n) {
  static const short kUpper = +32767;
  static const short kLower = -32767;

  const double scaled = -n * d;
  const double capped =
      std::min<double>(scaled, static_cast<double>(kUpper));
  const double clamped =
      std::max<double>(capped, static_cast<double>(kLower));
  return static_cast<short>(clamped);
}
230
// Maps the character that follows a backslash in an escape sequence to the
// character it denotes: '0' -> NUL, 'a', 'b', 't', 'n', 'v', 'f', 'r' -> the
// usual control characters, 's' -> space, '\\' -> backslash.
// Any other character yields '\0'.
inline char getEscapedChar(const char p) {
  static const char kCodes[] =
      { '0', 'a', 'b', 't', 'n', 'v', 'f', 'r', 's', '\\' };
  static const char kValues[] =
      { '\0', '\a', '\b', '\t', '\n', '\v', '\f', '\r', ' ', '\\' };

  for (unsigned int i = 0; i < sizeof(kCodes) / sizeof(kCodes[0]); ++i) {
    if (kCodes[i] == p) return kValues[i];
  }

  return '\0';  // unrecognised escape character
}
248
// Return a 64-bit hash of the given data: either `size` bytes starting at
// `str`, or the contents of a std::string. Only declared here; the hash
// algorithm is in the definition, which is not part of this header.
uint64_t fingerprint(const char *str, size_t size);
uint64_t fingerprint(const std::string &str);

// Conversions between std::string and std::wstring, declared only for native
// Windows builds (not Cygwin). Judging by the names they convert between
// UTF-8 and wide-character text -- confirm against the definitions.
#if defined(_WIN32) && !defined(__CYGWIN__)
std::wstring Utf8ToWide(const std::string &input);
std::string WideToUtf8(const std::wstring &input);
#endif
257 }
258 #endif
259